diff --git a/.clang-format b/.clang-format index 4bdc2426..b68024fe 100644 --- a/.clang-format +++ b/.clang-format @@ -1,57 +1,246 @@ -BasedOnStyle: Google - -# Allow double brackets such as std::vector>. -Standard: Cpp11 - -# Indent 4 spaces at a time. -IndentWidth: 4 - -# Keep lines under 100 columns long. -ColumnLimit: 100 - -# Always break before braces -BreakBeforeBraces: Custom +--- +Language: Cpp +# BasedOnStyle: LLVM +AccessModifierOffset: -2 +AlignAfterOpenBracket: Align +AlignArrayOfStructures: None +AlignConsecutiveAssignments: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: true +AlignConsecutiveBitFields: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: false +AlignConsecutiveDeclarations: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: false +AlignConsecutiveMacros: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: false +AlignConsecutiveShortCaseStatements: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCaseColons: false +AlignEscapedNewlines: Right +AlignOperands: Align +AlignTrailingComments: + Kind: Always + OverEmptyLines: 0 +AllowAllArgumentsOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowBreakBeforeNoexceptSpecifier: Never +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortCompoundRequirementOnASingleLine: true +AllowShortEnumsOnASingleLine: true +AllowShortFunctionsOnASingleLine: All +AllowShortIfStatementsOnASingleLine: Never +AllowShortLambdasOnASingleLine: All +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None 
+AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: MultiLine +AttributeMacros: + - __capability +BinPackArguments: true +BinPackParameters: true +BitFieldColonSpacing: Both BraceWrapping: - AfterCaseLabel: true - AfterClass: true - AfterControlStatement: true - AfterEnum: true - AfterFunction: true - AfterNamespace: true - AfterStruct: true - AfterUnion: true - BeforeCatch: true - BeforeElse: true - IndentBraces: false - SplitEmptyFunction: false - SplitEmptyRecord: false - SplitEmptyNamespace: false - - # Keeps extern "C" blocks unindented. + AfterCaseLabel: false + AfterClass: false + AfterControlStatement: Never + AfterEnum: false AfterExternBlock: false - -# Indent case labels. + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + BeforeCatch: false + BeforeElse: false + BeforeLambdaBody: false + BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +BreakAdjacentStringLiterals: true +BreakAfterAttributes: Leave +BreakAfterJavaFieldAnnotations: false +BreakArrays: true +BreakBeforeBinaryOperators: None +BreakBeforeConceptDeclarations: Always +BreakBeforeBraces: Attach +BreakBeforeInlineASMColon: OnlyMultiline +BreakBeforeTernaryOperators: true +BreakConstructorInitializers: BeforeColon +BreakInheritanceList: BeforeColon +BreakStringLiterals: true +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +DisableFormat: false +EmptyLineAfterAccessModifier: Never +EmptyLineBeforeAccessModifier: LogicalBlock +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IfMacros: + - KJ_IF_MAYBE +IncludeBlocks: Preserve +IncludeCategories: + - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + Priority: 2 + 
SortPriority: 0 + CaseSensitive: false + - Regex: '^(<|"(gtest|gmock|isl|json)/)' + Priority: 3 + SortPriority: 0 + CaseSensitive: false + - Regex: '.*' + Priority: 1 + SortPriority: 0 + CaseSensitive: false +IncludeIsMainRegex: '(Test)?$' +IncludeIsMainSourceRegex: '' +IndentAccessModifiers: false +IndentCaseBlocks: false IndentCaseLabels: false - -# Right-align pointers and references -PointerAlignment: Right - -# ANGLE likes to align things as much as possible. -AlignOperands: true -AlignConsecutiveAssignments: true - -# Use 0 space negative offset for access modifiers -AccessModifierOffset: -4 - -AllowShortCaseLabelsOnASingleLine: true - -# Useful for spacing out functions in classes +IndentExternBlock: AfterExternBlock +IndentGotoLabels: true +IndentPPDirectives: None +IndentRequiresClause: true +IndentWidth: 2 +IndentWrappedFunctionNames: false +InsertBraces: false +InsertNewlineAtEOF: false +InsertTrailingCommas: None +IntegerLiteralSeparator: + Binary: 0 + BinaryMinDigits: 0 + Decimal: 0 + DecimalMinDigits: 0 + Hex: 0 + HexMinDigits: 0 +JavaScriptQuotes: Leave +JavaScriptWrapImports: true KeepEmptyLinesAtTheStartOfBlocks: true +KeepEmptyLinesAtEOF: false +LambdaBodyIndentation: Signature +LineEnding: DeriveLF +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Auto +ObjCBlockIndentWidth: 2 +ObjCBreakBeforeNestedBlockParam: true +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PackConstructorInitializers: BinPack +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakOpenParenthesis: 0 +PenaltyBreakScopeResolution: 500 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyIndentedWhitespace: 0 +PenaltyReturnTypeOnItsOwnLine: 60 +PointerAlignment: Right +PPIndentWidth: -1 +QualifierAlignment: Leave +ReferenceAlignment: Pointer +ReflowComments: 
true +RemoveBracesLLVM: false +RemoveParentheses: Leave +RemoveSemicolon: false +RequiresClausePosition: OwnLine +RequiresExpressionIndentation: OuterScope +SeparateDefinitionBlocks: Leave +ShortNamespaceLines: 1 +SkipMacroDefinitionBody: false +SortIncludes: CaseSensitive +SortJavaStaticImport: Before +SortUsingDeclarations: LexicographicNumeric +SpaceAfterCStyleCast: false +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceAroundPointerQualifiers: Default +SpaceBeforeAssignmentOperators: true +SpaceBeforeCaseColon: false +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeJsonColon: false +SpaceBeforeParens: ControlStatements +SpaceBeforeParensOptions: + AfterControlStatements: true + AfterForeachMacros: true + AfterFunctionDefinitionName: false + AfterFunctionDeclarationName: false + AfterIfMacros: true + AfterOverloadedOperator: false + AfterPlacementOperator: true + AfterRequiresInClause: false + AfterRequiresInExpression: false + BeforeNonEmptyParentheses: false +SpaceBeforeRangeBasedForLoopColon: true +SpaceBeforeSquareBrackets: false +SpaceInEmptyBlock: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: Never +SpacesInContainerLiterals: true +SpacesInLineCommentPrefix: + Minimum: 1 + Maximum: -1 +SpacesInParens: Never +SpacesInParensOptions: + InCStyleCasts: false + InConditionalStatements: false + InEmptyParentheses: false + Other: false +SpacesInSquareBrackets: false +Standard: Latest +StatementAttributeLikeMacros: + - Q_EMIT +StatementMacros: + - Q_UNUSED + - QT_REQUIRE_VERSION +TabWidth: 8 +UseTab: Never +VerilogBreakBetweenInstancePorts: true +WhitespaceSensitiveMacros: + - BOOST_PP_STRINGIZE + - CF_SWIFT_NAME + - NS_SWIFT_NAME + - PP_STRINGIZE + - STRINGIZE +... -# Indent nested PP directives. 
-IndentPPDirectives: AfterHash - -# Include blocks style -IncludeBlocks: Preserve - -KeepEmptyLinesAtTheStartOfBlocks: false \ No newline at end of file diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml index 4c458487..cd985957 100644 --- a/.github/workflows/clang-format-check.yml +++ b/.github/workflows/clang-format-check.yml @@ -11,10 +11,11 @@ jobs: - 'src' - 'benchmark/src' - 'test' + - 'python/src' steps: - uses: actions/checkout@v3 - name: Run clang-format style check for C/C++/Protobuf programs. - uses: jidicula/clang-format-action@v4.10.1 + uses: jidicula/clang-format-action@v4.13.0 with: - clang-format-version: '13' + clang-format-version: '18' check-path: ${{ matrix.path }} \ No newline at end of file diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 00c387a2..6fd97f09 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -16,19 +16,21 @@ jobs: # well on Windows or Mac. You can convert this to a matrix build if you need # cross-platform coverage. # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix - runs-on: ubuntu-22.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v3 - - name: Install gcc-11 + - name: Install gcc run: | sudo apt update - sudo apt install -y wget build-essential manpages-dev software-properties-common gcc g++ libboost-all-dev libgflags-dev + sudo apt install -y wget build-essential manpages-dev software-properties-common gcc g++ - name: Configure CMake # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make. 
# See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type - run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} + run: | + git submodule update --init --recursive + cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} - name: Build # Build your program with the given configuration diff --git a/.gitignore b/.gitignore index 5d84fe75..80feffb0 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,8 @@ *.iml .DS_Store /practice/Clustream/cmake-build-debug/ +*.pyc +*.tmp /cmake-build-debug/* /cmake-build-release/* @@ -35,4 +37,6 @@ test/datasets/ *.egg-info/ *.out -results.txt +*.txt +*.whl +/dist/ diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..10271add --- /dev/null +++ b/.gitmodules @@ -0,0 +1,6 @@ +[submodule "third_party/pybind11"] + path = third_party/pybind11 + url = https://github.com/pybind/pybind11 +[submodule "third_party/gflags"] + path = third_party/gflags + url = https://github.com/gflags/gflags diff --git a/CMakeLists.txt b/CMakeLists.txt index eab597d3..e92a5bbf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,21 +11,16 @@ endif() set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake;${CMAKE_MODULE_PATH}") include(cmake/macros.cmake) include(cmake/default.cmake) +include(FetchContent) # C++ Standard set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) -#gcc 10 g++10 -message(STATUS "sudo add-apt-repository 'deb http://mirrors.kernel.org/ubuntu hirsute main universe'") -message(STATUS "sudo apt-get update") -message(STATUS "sudo apt install gcc-11 g++-11") -message(STATUS "sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11") -message(STATUS "sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11") # Set Optimization Flags -set(CMAKE_CXX_FLAGS "-g -std=c++20 -Wall -Werror=return-type -fconcepts-diagnostics-depth=2 -lpthread -fopenmp -march=native") +set(CMAKE_CXX_FLAGS "-g -std=c++20 
-Wall -fconcepts-diagnostics-depth=2 -fopenmp -march=native") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -DNO_RACE_CHECK -DSESAME_DEBUG_MODE=1 -DDEBUG") -set(CMAKE_CXX_FLAGS_RELEASE "-Wno-ignored-qualifiers -Wno-sign-compare -O3 -DNDEBUG") +set(CMAKE_CXX_FLAGS_RELEASE "-Wno-ignored-qualifiers -Wno-sign-compare -O3 -DNDEBUG -flto=auto") # Set LOGGING_LEVEL Flag if (SESAME_LOGGING_LEVEL) @@ -41,28 +36,41 @@ message(STATUS "CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}") message(STATUS "CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}") message(STATUS "CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}") -option(ENABLE_UNIT_TESTS "Enable unit tests" ON) -message(STATUS "Enable testing: ${ENABLE_UNIT_TESTS}") +option(ENABLE_TESTS "Enable unit tests" ON) +message(STATUS "Enable testing: ${ENABLE_TESTS}") -if (ENABLE_UNIT_TESTS) +if (ENABLE_TESTS) enable_testing() # Google Test - include(FetchContent) - FetchContent_Declare( googletest GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG release-1.11.0 + GIT_TAG v1.14.0 ) - set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) FetchContent_MakeAvailable(googletest) include(GoogleTest) -# find_package(GTest 1.11.0) endif () + +# set(Boost_DEBUG 1) +set(BOOST_INCLUDE_LIBRARIES timer lockfree) +set(BOOST_ENABLE_CMAKE ON) +FetchContent_Declare( + Boost + URL https://github.com/boostorg/boost/releases/download/boost-1.84.0/boost-1.84.0.tar.xz + URL_MD5 893b5203b862eb9bbd08553e24ff146a + DOWNLOAD_NO_EXTRACT FALSE + EXCLUDE_FROM_ALL +) +FetchContent_MakeAvailable(Boost) + +set(GFLAGS_BUILD_SHARED_LIBS OFF CACHE BOOL "") +set(GFLAGS_BUILD_gflags_nothreads_LIB ON) +set(GFLAGS_BUILD_gflags_LIB OFF CACHE BOOL "") +add_subdirectory(third_party/gflags) + find_package(OpenMP) -find_package(gflags REQUIRED) # Print all used include directories message(STATUS "INCLUDE_DIRS:") @@ -77,10 +85,11 @@ add_subdirectory(src) # Add Library get_source_sesame(sesame_SOURCE_FILES) get_header_sesame(sesame_HEADER_FILES) -add_library(sesame STATIC 
${sesame_SOURCE_FILES} ${sesame_HEADER_FILES} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}) +add_library(sesame SHARED ${sesame_SOURCE_FILES} ${sesame_HEADER_FILES} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}) target_include_directories(sesame PUBLIC "include") target_include_directories(sesame PUBLIC "${CMAKE_CURRENT_BINARY_DIR}") -target_link_libraries(sesame PUBLIC ${LIBRARIES} pthread gflags) +target_link_libraries(sesame PUBLIC ${LIBRARIES} pthread Boost::timer Boost::lockfree) +target_link_libraries(sesame PRIVATE gflags) #Add benchmarks with command add_subdirectory(benchmark) @@ -90,3 +99,15 @@ add_subdirectory(test) install(DIRECTORY "include" DESTINATION "/sesame" COMPONENT SESAME) +option(ENABLE_PYTHON "Enable Python bindings" OFF) +message(STATUS "Enable Python bindings: ${ENABLE_PYTHON}") +if (ENABLE_PYTHON) + add_subdirectory(third_party/pybind11) + pybind11_add_module(pysame python/src/Pysame.cpp) + target_include_directories(pysame PRIVATE "include") + target_link_libraries(pysame PUBLIC sesame) + set_target_properties(pysame PROPERTIES + BUILD_RPATH "\$ORIGIN/" + INSTALL_RPATH "\$ORIGIN/" + ) +endif () \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..9b39b839 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,9 @@ +include CMakeLists.txt +include libsesame.so +graft src +graft include +graft python +graft cmake +graft third_party +graft benchmark +graft test \ No newline at end of file diff --git a/benchmark/src/Benchmark.cpp b/benchmark/src/Benchmark.cpp index eeec58d3..baa3f123 100644 --- a/benchmark/src/Benchmark.cpp +++ b/benchmark/src/Benchmark.cpp @@ -57,7 +57,9 @@ DEFINE_int32(num_last_arr, 2, "Number of last arrive"); DEFINE_int32(time_window, 50, "Time window"); DEFINE_int32(num_online_clusters, 80, "Number of online clusters"); // SL-KMeans -DEFINE_double(delta_grid, 0.2, "The delta parameter used int the grid for guessing the optimum."); +DEFINE_double( + delta_grid, 
0.2, + "The delta parameter used int the grid for guessing the optimum."); DEFINE_int32(num_samples, 10, "Number of samples"); // Generic DEFINE_int32(landmark, 10000, "Landmark"); @@ -78,75 +80,75 @@ DEFINE_int32(queue_size_threshold, 10000, "Benne queue size threshold"); DEFINE_int32(dim_threshold, 30, "Benne dimension threshold"); DEFINE_double(variance_threshold, 100.0, "Benne variance threshold"); DEFINE_int32(outliers_num_threshold, 200, "Benne outliers threshold"); -DEFINE_double(outliers_dist_threshold, 50.0, "Benne outliers distance threshold"); +DEFINE_double(outliers_dist_threshold, 50.0, + "Benne outliers distance threshold"); -int main(int argc, char **argv) -{ +int main(int argc, char **argv) { #ifndef NDEBUG - std::cerr << "\033[1;31m#####################################################" - "#######\n" - << "# #\n" - << "# DON'T run benchmark in debug mode. #\n" - << "# #\n" - << "############################################################" - "\033[0m\n"; - sleep(1); + std::cerr << "\033[1;31m#####################################################" + "#######\n" + << "# #\n" + << "# DON'T run benchmark in debug mode. #\n" + << "# #\n" + << "############################################################" + "\033[0m\n"; + sleep(1); #endif - // Parse parameters. 
- gflags::ParseCommandLineFlags(&argc, &argv, true); - param_t param; - param.algo = (AlgoType)FLAGS_algo; - param.algo = (AlgoType)FLAGS_algo; - param.input_file = FLAGS_input_file; - param.num_points = FLAGS_num_points; - param.dim = FLAGS_dim; - param.num_clusters = FLAGS_num_clusters; - param.max_in_nodes = FLAGS_max_in_nodes; - param.max_leaf_nodes = FLAGS_max_leaf_nodes; - param.distance_threshold = FLAGS_distance_threshold; - param.seed = FLAGS_seed; - param.coreset_size = FLAGS_coreset_size; - param.radius = FLAGS_radius; - param.delta = FLAGS_delta; - param.beta = FLAGS_beta; - param.buf_size = FLAGS_buf_size; - param.alpha = FLAGS_alpha; - param.lambda = FLAGS_lambda; - param.clean_interval = FLAGS_clean_interval; - param.min_weight = FLAGS_min_weight; - param.base = FLAGS_base; - param.cm = FLAGS_cm; - param.cl = FLAGS_cl; - param.grid_width = FLAGS_grid_width; - param.min_points = FLAGS_min_points; - param.epsilon = FLAGS_epsilon; - param.mu = FLAGS_mu; - param.num_last_arr = FLAGS_num_last_arr; - param.time_window = FLAGS_time_window; - param.num_online_clusters = FLAGS_num_online_clusters; - param.delta_grid = FLAGS_delta_grid; - param.num_samples = FLAGS_num_samples; - param.landmark = FLAGS_landmark; - param.sliding = FLAGS_sliding; - param.outlier_distance_threshold = FLAGS_outlier_distance_threshold; - param.outlier_cap = FLAGS_outlier_cap; - param.outlier_density_threshold = FLAGS_outlier_density_threshold; - param.neighbor_distance = FLAGS_neighbor_distance; - param.k = FLAGS_k; - param.arr_rate = FLAGS_arr_rate; - param.run_offline = FLAGS_run_offline; - param.run_eval = FLAGS_run_eval; - param.run_cmm = FLAGS_run_cmm; - param.run_pur = FLAGS_run_pur; - param.obj = (BenneObj)FLAGS_obj; - param.benne_threshold.dim = FLAGS_dim_threshold; - param.benne_threshold.queue_size = FLAGS_queue_size_threshold; - param.benne_threshold.variance = FLAGS_variance_threshold; - param.benne_threshold.outliers_num = FLAGS_outliers_num_threshold; - 
param.benne_threshold.outliers_dist = FLAGS_outliers_dist_threshold; + // Parse parameters. + gflags::ParseCommandLineFlags(&argc, &argv, true); + param_t param; + param.algo = (AlgoType)FLAGS_algo; + param.algo = (AlgoType)FLAGS_algo; + param.input_file = FLAGS_input_file; + param.num_points = FLAGS_num_points; + param.dim = FLAGS_dim; + param.num_clusters = FLAGS_num_clusters; + param.max_in_nodes = FLAGS_max_in_nodes; + param.max_leaf_nodes = FLAGS_max_leaf_nodes; + param.distance_threshold = FLAGS_distance_threshold; + param.seed = FLAGS_seed; + param.coreset_size = FLAGS_coreset_size; + param.radius = FLAGS_radius; + param.delta = FLAGS_delta; + param.beta = FLAGS_beta; + param.buf_size = FLAGS_buf_size; + param.alpha = FLAGS_alpha; + param.lambda = FLAGS_lambda; + param.clean_interval = FLAGS_clean_interval; + param.min_weight = FLAGS_min_weight; + param.base = FLAGS_base; + param.cm = FLAGS_cm; + param.cl = FLAGS_cl; + param.grid_width = FLAGS_grid_width; + param.min_points = FLAGS_min_points; + param.epsilon = FLAGS_epsilon; + param.mu = FLAGS_mu; + param.num_last_arr = FLAGS_num_last_arr; + param.time_window = FLAGS_time_window; + param.num_online_clusters = FLAGS_num_online_clusters; + param.delta_grid = FLAGS_delta_grid; + param.num_samples = FLAGS_num_samples; + param.landmark = FLAGS_landmark; + param.sliding = FLAGS_sliding; + param.outlier_distance_threshold = FLAGS_outlier_distance_threshold; + param.outlier_cap = FLAGS_outlier_cap; + param.outlier_density_threshold = FLAGS_outlier_density_threshold; + param.neighbor_distance = FLAGS_neighbor_distance; + param.k = FLAGS_k; + param.arr_rate = FLAGS_arr_rate; + param.run_offline = FLAGS_run_offline; + param.run_eval = FLAGS_run_eval; + param.run_cmm = FLAGS_run_cmm; + param.run_pur = FLAGS_run_pur; + param.obj = (BenneObj)FLAGS_obj; + param.benne_threshold.dim = FLAGS_dim_threshold; + param.benne_threshold.queue_size = FLAGS_queue_size_threshold; + param.benne_threshold.variance = 
FLAGS_variance_threshold; + param.benne_threshold.outliers_num = FLAGS_outliers_num_threshold; + param.benne_threshold.outliers_dist = FLAGS_outliers_dist_threshold; - param.fast_source = true; - param.store = false; + param.fast_source = true; + param.store = false; - RunBenchmark(param); + RunBenchmark(param); } diff --git a/build.sh b/build.sh new file mode 100755 index 00000000..f224f25e --- /dev/null +++ b/build.sh @@ -0,0 +1,5 @@ +for v in 3.9 3.10 3.11 3.12 3.13 +do +python$v -m pip install build +python$v -m build +done diff --git a/cmake/CPM.cmake b/cmake/CPM.cmake new file mode 100644 index 00000000..d0fd0e8e --- /dev/null +++ b/cmake/CPM.cmake @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: MIT +# +# SPDX-FileCopyrightText: Copyright (c) 2019-2023 Lars Melchior and contributors + +set(CPM_DOWNLOAD_VERSION 0.39.0) +set(CPM_HASH_SUM "66639bcac9dd2907b2918de466783554c1334446b9874e90d38e3778d404c2ef") + +if(CPM_SOURCE_CACHE) + set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") +elseif(DEFINED ENV{CPM_SOURCE_CACHE}) + set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") +else() + set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake") +endif() + +# Expand relative path. This is important if the provided path contains a tilde (~) +get_filename_component(CPM_DOWNLOAD_LOCATION ${CPM_DOWNLOAD_LOCATION} ABSOLUTE) + +file(DOWNLOAD + https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake + ${CPM_DOWNLOAD_LOCATION} EXPECTED_HASH SHA256=${CPM_HASH_SUM} +) + +include(${CPM_DOWNLOAD_LOCATION}) diff --git a/include/APIs/APIs.h b/include/APIs/APIs.h deleted file mode 100644 index 291a8c21..00000000 --- a/include/APIs/APIs.h +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) - -// -// Created by Shuhao Zhang on 19/07/2021. 
-// - -#ifndef SESAME_INCLUDE_APIS_APIS_H_ -#define SESAME_INCLUDE_APIS_APIS_H_ - -using namespace std; -namespace SESAME -{ - -class APIs -{}; -} // namespace SESAME - -#endif // SESAME_INCLUDE_APIS_APIS_H_ diff --git a/include/Algorithm/Algorithm.hpp b/include/Algorithm/Algorithm.hpp index 27f3a05b..960065e8 100644 --- a/include/Algorithm/Algorithm.hpp +++ b/include/Algorithm/Algorithm.hpp @@ -22,62 +22,56 @@ using namespace std; -namespace SESAME -{ - +namespace SESAME { class Algorithm; typedef std::shared_ptr AlgorithmPtr; -class Algorithm -{ +class Algorithm { public: - Algorithm() = default; - virtual ~Algorithm() = default; - virtual void Init() = 0; - virtual void RunOnline(SESAME::PointPtr input) = 0; - virtual void RunOffline(SESAME::DataSinkPtr ptr) = 0; - void Insert(SESAME::PointPtr input){}; - virtual void OutputOnline(std::vector ¢ers){}; - void Store(std::string output_file, int dim, std::vector results); - Timer win_timer, ds_timer, out_timer, ref_timer, sum_timer, lat_timer, on_timer; - param_t param; - int cnt = 0; - std::vector et; - PerfRes GetPerf() - { - PerfRes res; - res.win_us = win_timer.sum / 1000; - res.ds_us = ds_timer.sum / 1000; - res.out_us = out_timer.sum / 1000; - res.ref_us = ref_timer.sum / 1000; - res.sum_us = sum_timer.sum / 1000; - if (et.size() == 5) - { - res.on_20 = et[0] / 1e6; - res.on_40 = et[1] / 1e6; - res.on_60 = et[2] / 1e6; - res.on_80 = et[3] / 1e6; - res.on_100 = et[4] / 1e6; - } - res.lat_us = lat_timer.sum / 1e3 / param.num_points; - res.et_s = on_timer.sum / 1e9; - res.qps = param.num_points * 1e9 / sum_timer.sum; - return res; + Algorithm() = default; + virtual ~Algorithm() = default; + virtual void Init() = 0; + virtual void RunOnline(SESAME::PointPtr input) = 0; + virtual void RunOffline(SESAME::DataSinkPtr ptr) = 0; + void Insert(SESAME::PointPtr input) {}; + virtual void OutputOnline(std::vector ¢ers) {}; + void Store(std::string output_file, int dim, std::vector results); + Timer win_timer, ds_timer, 
out_timer, ref_timer, sum_timer, lat_timer, + on_timer; + param_t param; + int cnt = 0; + std::vector et; + PerfRes GetPerf() { + PerfRes res; + res.win_us = win_timer.sum / 1000; + res.ds_us = ds_timer.sum / 1000; + res.out_us = out_timer.sum / 1000; + res.ref_us = ref_timer.sum / 1000; + res.sum_us = sum_timer.sum / 1000; + if (et.size() == 5) { + res.on_20 = et[0] / 1e6; + res.on_40 = et[1] / 1e6; + res.on_60 = et[2] / 1e6; + res.on_80 = et[3] / 1e6; + res.on_100 = et[4] / 1e6; } - void Count() - { - ++cnt; - if (cnt >= param.num_points * 0.2) - { - auto now = std::chrono::high_resolution_clock::now(); - et.push_back( - std::chrono::duration_cast(now - sum_timer.start) - .count()); - cnt = 1; - } + res.lat_us = lat_timer.sum / 1e3 / param.num_points; + res.et_s = on_timer.sum / 1e9; + res.qps = param.num_points * 1e9 / sum_timer.sum; + return res; + } + void Count() { + ++cnt; + if (cnt >= param.num_points * 0.2) { + auto now = std::chrono::high_resolution_clock::now(); + et.push_back(std::chrono::duration_cast( + now - sum_timer.start) + .count()); + cnt = 1; } + } }; -} // namespace SESAME +} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_ALGORITHM_HPP_ +#endif // SESAME_INCLUDE_ALGORITHM_ALGORITHM_HPP_ diff --git a/include/Algorithm/AlgorithmFactory.hpp b/include/Algorithm/AlgorithmFactory.hpp index 3aa19a72..856bad5f 100644 --- a/include/Algorithm/AlgorithmFactory.hpp +++ b/include/Algorithm/AlgorithmFactory.hpp @@ -1,4 +1,5 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) // // Created by Shuhao Zhang on 26/07/2021. 
@@ -10,12 +11,10 @@ #include #include -namespace SESAME -{ -class AlgorithmFactory -{ +namespace SESAME { +class AlgorithmFactory { public: - static SESAME::AlgorithmPtr create(param_t &cmd_params); + static SESAME::AlgorithmPtr create(param_t &cmd_params); }; -} // namespace SESAME -#endif // SESAME_SRC_ALGORITHM_ALGORITHMFACTORY_HPP_ +} // namespace SESAME +#endif // SESAME_SRC_ALGORITHM_ALGORITHMFACTORY_HPP_ diff --git a/include/Algorithm/Benne.hpp b/include/Algorithm/Benne.hpp index 9243193a..892c6db8 100644 --- a/include/Algorithm/Benne.hpp +++ b/include/Algorithm/Benne.hpp @@ -14,83 +14,55 @@ #include -namespace SESAME -{ -struct characteristics -{ - bool frequentDrift = false; - bool manyOutliers = false; - bool highDimension = false; +namespace SESAME { +struct characteristics { + bool frequentDrift = false; + bool manyOutliers = false; + bool highDimension = false; }; -enum windowSelection -{ - landmark = 0, - sliding = 1, - damped = 2 -}; -enum dataSelection -{ - MCs = 0, - CFT = 1, - CoreT = 2, - DPT = 3, - Grids = 4, - AMS = 5 -}; -enum outlierSelection -{ - OD = 0, - NoOD = 1, - ODB = 2, - ODT = 3, - ODBT = 4 -}; -enum refineSelection -{ - Incre = 0, - OneShot = 1, - NoRefine = 2 -}; -class Benne : public Algorithm -{ +enum windowSelection { landmark = 0, sliding = 1, damped = 2 }; +enum dataSelection { MCs = 0, CFT = 1, CoreT = 2, DPT = 3, Grids = 4, AMS = 5 }; +enum outlierSelection { OD = 0, NoOD = 1, ODB = 2, ODT = 3, ODBT = 4 }; +enum refineSelection { Incre = 0, OneShot = 1, NoRefine = 2 }; +class Benne : public Algorithm { public: - std::vector queue_; - std::vector materialized_centers; - // std::vector centers; - BenneThreshold T; - bool ds_changed = false; - AlgorithmPtr algo; - BenneObj obj; - characteristics chara; - windowSelection windowSel; - dataSelection dataSel; - outlierSelection outlierSel; - refineSelection refineSel; - KMeans kmeans; - int first_algo; - size_t change_count = 0; - std::vector> change_log; - Timer mig_timer, 
det_timer; + std::vector queue_; + std::vector materialized_centers; + // std::vector centers; + BenneThreshold T; + bool ds_changed = false; + AlgorithmPtr algo; + BenneObj obj; + characteristics chara; + windowSelection windowSel; + dataSelection dataSel; + outlierSelection outlierSel; + refineSelection refineSel; + KMeans kmeans; + int first_algo; + size_t change_count = 0; + std::vector> change_log; + Timer mig_timer, det_timer; - using MicroClusters = ClusteringFeaturesList; - static constexpr int INCRE_REF_CNT = 50000; + using MicroClusters = ClusteringFeaturesList; + static constexpr int INCRE_REF_CNT = 50000; - Benne(param_t &cmd_params); + Benne(param_t &cmd_params); - ~Benne(); + ~Benne(); - void Init() override; + void Init() override; - void RunOnline(PointPtr input) override; + void RunOnline(PointPtr input) override; - void RunOffline(DataSinkPtr sinkPtr) override; + void RunOffline(DataSinkPtr sinkPtr) override; private: - void Train(const PointPtr &point); - int Infer(const SESAME::PointPtr &input); - void UpdateAlgo(int, int); + void Train(const PointPtr &point); + int Infer(const SESAME::PointPtr &input); + void UpdateAlgo(int, int); }; -} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_BENNE_HPP_ +} // namespace SESAME +#endif // SESAME_INCLUDE_ALGORITHM_BENNE_HPP_ diff --git a/include/Algorithm/Birch.hpp b/include/Algorithm/Birch.hpp index f9c2e2dc..35d929e8 100644 --- a/include/Algorithm/Birch.hpp +++ b/include/Algorithm/Birch.hpp @@ -10,50 +10,49 @@ #include #include #include -namespace SESAME -{ +namespace SESAME { -class BirchParameter : public SesameParam -{ +class BirchParameter : public SesameParam { public: - int max_in_nodes; // B - int max_leaf_nodes; // L - double distance_threshold; // T + int max_in_nodes; // B + int max_leaf_nodes; // L + double distance_threshold; // T }; -class Birch : public Algorithm -{ +class Birch : public Algorithm { public: - BirchParameter BirchParam; - std::shared_ptr kmeans; // used for offline 
initialization - int leafMask = 0; - NodePtr root; - vector leafNodes; - CFTreePtr cfTree; - Birch(param_t &cmd_params); + BirchParameter BirchParam; + std::shared_ptr kmeans; // used for offline initialization + int leafMask = 0; + NodePtr root; + vector leafNodes; + CFTreePtr cfTree; + Birch(param_t &cmd_params); - ~Birch(); + ~Birch(); - void Init() override; + void Init() override; - void RunOnline(PointPtr input) override; + void RunOnline(PointPtr input) override; - void RunOffline(DataSinkPtr sinkPtr) override; + void RunOffline(DataSinkPtr sinkPtr) override; private: - void forwardInsert(PointPtr point); - void backwardEvolution(NodePtr &curNode, PointPtr &point); - void calculateCorDistance(vector> &distance, vector &nodes); - double calculateRadius(PointPtr &point, PointPtr ¢roid); - void selectChild(vector &children, PointPtr &insertPoint, NodePtr &node); - double clusterToClusterDist(NodePtr &nodeA, NodePtr &nodeB); - void pointToClusterDist(PointPtr &insertPoint, NodePtr &node, double &dist); - void calculateCentroid(CFPtr &cf, PointPtr ¢roid); - void updateNLS(NodePtr &node, PointPtr &point, bool updateAll); - void initializeCF(CFPtr &cf, int dim); - void setCFToBlankNode(SESAME::NodePtr &curNode, SESAME::PointPtr &point); - void addNodeNLSToNode(SESAME::NodePtr &child, SESAME::NodePtr &parent); - void clearChildParents(vector &children); + void forwardInsert(PointPtr point); + void backwardEvolution(NodePtr &curNode, PointPtr &point); + void calculateCorDistance(vector> &distance, + vector &nodes); + double calculateRadius(PointPtr &point, PointPtr ¢roid); + void selectChild(vector &children, PointPtr &insertPoint, + NodePtr &node); + double clusterToClusterDist(NodePtr &nodeA, NodePtr &nodeB); + void pointToClusterDist(PointPtr &insertPoint, NodePtr &node, double &dist); + void calculateCentroid(CFPtr &cf, PointPtr ¢roid); + void updateNLS(NodePtr &node, PointPtr &point, bool updateAll); + void initializeCF(CFPtr &cf, int dim); + void 
setCFToBlankNode(SESAME::NodePtr &curNode, SESAME::PointPtr &point); + void addNodeNLSToNode(SESAME::NodePtr &child, SESAME::NodePtr &parent); + void clearChildParents(vector &children); }; -} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_BIRCH_HPP_ +} // namespace SESAME +#endif // SESAME_INCLUDE_ALGORITHM_BIRCH_HPP_ diff --git a/include/Algorithm/CluStream.hpp b/include/Algorithm/CluStream.hpp index 7131455d..7b8bd5e6 100644 --- a/include/Algorithm/CluStream.hpp +++ b/include/Algorithm/CluStream.hpp @@ -1,4 +1,5 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) // // Created by 1124a on 2021/8/16. @@ -21,57 +22,56 @@ #include #include -namespace SESAME -{ +namespace SESAME { -class CluStreamParameter : public SesameParam -{ +class CluStreamParameter : public SesameParam { public: - int num_last_arr; - int time_window; - unsigned int time_interval; - int num_clusters; // total number of micro clusters online - int num_offline_clusters; // total number of micro clusters online - double radius; // radius factor - int buf_size; - int offline_time_window; - int seed; + int num_last_arr; + int time_window; + unsigned int time_interval; + int num_clusters; // total number of micro clusters online + int num_offline_clusters; // total number of micro clusters online + double radius; // radius factor + int buf_size; + int offline_time_window; + int seed; }; const double doubleMax = std::numeric_limits::max(); -class CluStream : public Algorithm -{ +class CluStream : public Algorithm { public: - CluStreamParameter CluStreamParam; - std::shared_ptr kmeans; // used for offline initialization - LandmarkWindowPtr window; - MicroClusters microClusters; // Defined in Snapshot, std::vector - MicroClusters delMicroClusters; - int pointsFitted; - int pointsForgot; - int pointsMerged; - int startTime; - int lastUpdateTime; - CluStream(param_t 
&cmd_params); - ~CluStream(); + CluStreamParameter CluStreamParam; + std::shared_ptr kmeans; // used for offline initialization + LandmarkWindowPtr window; + MicroClusters + microClusters; // Defined in Snapshot, std::vector + MicroClusters delMicroClusters; + int pointsFitted; + int pointsForgot; + int pointsMerged; + int startTime; + int lastUpdateTime; + CluStream(param_t &cmd_params); + ~CluStream(); - void Init() override; - void RunOnline(PointPtr input) override; - void RunOffline(DataSinkPtr sinkPtr) override; + void Init() override; + void RunOnline(PointPtr input) override; + void RunOffline(DataSinkPtr sinkPtr) override; private: - void initOffline(vector &initData, vector &initialData); - void incrementalCluster(PointPtr data); - double calRadius(MicroClusterPtr closestCluster); - void insertIntoCluster(PointPtr data, MicroClusterPtr closestCluster); - bool deleteCreateCluster(PointPtr data); - void MergeCreateCluster(PointPtr data); - void microClusterToPoint(MicroClusters µClusters, vector &points) const; - static double distance(dataPoint a, dataPoint b, int dim); + void initOffline(vector &initData, vector &initialData); + void incrementalCluster(PointPtr data); + double calRadius(MicroClusterPtr closestCluster); + void insertIntoCluster(PointPtr data, MicroClusterPtr closestCluster); + bool deleteCreateCluster(PointPtr data); + void MergeCreateCluster(PointPtr data); + void microClusterToPoint(MicroClusters µClusters, + vector &points) const; + static double distance(dataPoint a, dataPoint b, int dim); - bool initilized = false; - vector initialInputs; + bool initilized = false; + vector initialInputs; }; -} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_CLUSTREAM_HPP_ +} // namespace SESAME +#endif // SESAME_INCLUDE_ALGORITHM_CLUSTREAM_HPP_ diff --git a/include/Algorithm/DBStream.hpp b/include/Algorithm/DBStream.hpp index 7ace7053..6d7b71b1 100644 --- a/include/Algorithm/DBStream.hpp +++ b/include/Algorithm/DBStream.hpp @@ -10,61 +10,59 
@@ #include "Algorithm/OfflineRefinement/ConnectedRegions.hpp" #include "Utils/BenchmarkUtils.hpp" -namespace SESAME -{ +namespace SESAME { typedef std::vector> Clusters; -class DBStreamParams : public SesameParam -{ +class DBStreamParams : public SesameParam { public: - double radius; - double lambda; - int clean_interval; // Time gap - double min_weight; // minimum weight - double alpha; //α, intersection factor - double base; // base of decay function + double radius; + double lambda; + int clean_interval; // Time gap + double min_weight; // minimum weight + double alpha; // α, intersection factor + double base; // base of decay function }; -class DBStream : public Algorithm -{ +class DBStream : public Algorithm { public: - DBStreamParams dbStreamParams; - DampedWindowPtr dampedWindow; - std::vector microClusters; - SESAME::WeightedAdjacencyList weightedAdjacencyList; - std::vector - microClusterNN; // micro clusters found in function findFixedRadiusNN - double weakEntry; // W_weak, weak entries - double aWeakEntry; - timespec startTime; - timespec lastArrivingTime0; - timespec pointArrivingTime0; - timespec lastCleanTime0; + DBStreamParams dbStreamParams; + DampedWindowPtr dampedWindow; + std::vector microClusters; + SESAME::WeightedAdjacencyList weightedAdjacencyList; + std::vector + microClusterNN; // micro clusters found in function findFixedRadiusNN + double weakEntry; // W_weak, weak entries + double aWeakEntry; + timespec startTime; + timespec lastArrivingTime0; + timespec pointArrivingTime0; + timespec lastCleanTime0; - int lastArrivingTime; - int pointArrivingTime; - int lastCleanTime; - int microClusterIndex; - // Final output of clusters - Clusters finalClusters; - ConnectedRegions connectedRegions; - // Connectivity graph - // unordered_map> connecvtivityGraphId; + int lastArrivingTime; + int pointArrivingTime; + int lastCleanTime; + int microClusterIndex; + // Final output of clusters + Clusters finalClusters; + ConnectedRegions connectedRegions; + 
// Connectivity graph + // unordered_map> connecvtivityGraphId; - DBStream(param_t &cmd_params); - ~DBStream(); - void Init() override; - void RunOnline(PointPtr input) override; - void RunOffline(DataSinkPtr sinkPtr) override; + DBStream(param_t &cmd_params); + ~DBStream(); + void Init() override; + void RunOnline(PointPtr input) override; + void RunOffline(DataSinkPtr sinkPtr) override; private: - bool isInitial = false; + bool isInitial = false; - void update(PointPtr dataPoint); - bool checkMove(std::vector microClusters) const; - std::vector findFixedRadiusNN(PointPtr dataPoint, double decayFactor); - void cleanUp(int nowTime); + void update(PointPtr dataPoint); + bool checkMove(std::vector microClusters) const; + std::vector findFixedRadiusNN(PointPtr dataPoint, + double decayFactor); + void cleanUp(int nowTime); }; -} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_DBSTREAM_HPP_ +} // namespace SESAME +#endif // SESAME_INCLUDE_ALGORITHM_DBSTREAM_HPP_ diff --git a/include/Algorithm/DStream.hpp b/include/Algorithm/DStream.hpp index 1198f897..112d981b 100644 --- a/include/Algorithm/DStream.hpp +++ b/include/Algorithm/DStream.hpp @@ -14,76 +14,84 @@ double lambda: user defined parameter lambda in damped window double beta: control the time interval of deleting the same sporadic grid double cm: controls the threshold for dense grids double cl: controls the threshold for sparse grid, require cm > cl -double grid_width: width of grid (default with the same width in every dimension) -double gap: self-updated based on cm and cl according to eq 26 in the paper -Note: -1.Since it is unrealistic to set the number of grids in the total feature space with a fixed value -ahead of time, in this implementation, we timely adjust N during the clustering procedure: -pi_(maxVals - minVals) / grid_width. 2.For simplicity, we directly use the data index as the its -arriving timestamp. 
+double grid_width: width of grid (default with the same width in every +dimension) double gap: self-updated based on cm and cl according to eq 26 in the +paper Note: 1.Since it is unrealistic to set the number of grids in the total +feature space with a fixed value ahead of time, in this implementation, we +timely adjust N during the clustering procedure: pi_(maxVals - minVals) / +grid_width. 2.For simplicity, we directly use the data index as the its arriving +timestamp. **/ -namespace SESAME -{ +namespace SESAME { class DStream; -typedef std::unordered_map HashMap; -class DStream : public Algorithm -{ +typedef std::unordered_map + HashMap; +class DStream : public Algorithm { public: - DampedWindowPtr dampedWindow; - double startTime = 0.0; - int currentTimeStamp; - int gap; // Time gap between calls to the offline component - double dm; // Density threshold for dense grids; controlled by cm - double dl; // Density threshold for sparse grids; controlled by cl - int NGrids; // The number of density grids ,with an initial value 0 - HashMap gridList; - std::unordered_map deletedGrids; - // Store the deleted sporadic grids: - std::vector clusterList; // A list of all Grid Clusters - std::vector - newClusterList; // A list of grid clusters used when re-clustering an existing cluster. 
- std::vector minVals; // The minimum value seen for a numerical dim; used to calculate N - std::vector maxVals; // The maximum value seen for a numerical dim; used to calculate N - bool init = false; + DampedWindowPtr dampedWindow; + double startTime = 0.0; + int currentTimeStamp; + int gap; // Time gap between calls to the offline component + double dm; // Density threshold for dense grids; controlled by cm + double dl; // Density threshold for sparse grids; controlled by cl + int NGrids; // The number of density grids ,with an initial value 0 + HashMap gridList; + std::unordered_map deletedGrids; + // Store the deleted sporadic grids: + std::vector clusterList; // A list of all Grid Clusters + std::vector newClusterList; // A list of grid clusters used when + // re-clustering an existing cluster. + std::vector minVals; // The minimum value seen for a numerical dim; + // used to calculate N + std::vector maxVals; // The maximum value seen for a numerical dim; + // used to calculate N + bool init = false; - DStream(param_t &cmd_params); - ~DStream(); - void Init() override; - void RunOnline(PointPtr input) override; - void RunOffline(DataSinkPtr sinkPtr) override; + DStream(param_t &cmd_params); + ~DStream(); + void Init() override; + void RunOnline(PointPtr input) override; + void RunOffline(DataSinkPtr sinkPtr) override; private: - bool recalculateN = - false; // flag indicating whether N needs to be recalculated after this instance - std::vector Coord; - void ifReCalculate(PointPtr point); - void reCalculateParameter(); - void GridListUpdate(std::vector coordinate); - void initialClustering(); - void adjustClustering(); - bool adjustLabels(); - bool inspectChangedGrids(); - HashMap adjustForSparseGrid(DensityGrid grid, CharacteristicVector characteristicVec, - int gridClass); - HashMap adjustForDenseGrid(DensityGrid grid, CharacteristicVector characteristicVec, - int gridClass); - HashMap adjustForTransitionalGrid(DensityGrid grid, CharacteristicVector 
characteristicVec, - int gridClass); - void removeSporadic(); - HashMap reCluster(GridCluster gridCluster); - HashMap adjustNewLabels(HashMap newGridList); - void mergeClusters(int smallCluster, int bigCluster); - void cleanClusters(); - HashMap cleanNewClusters(HashMap newGridList); - HashMap mergeNewClusters(HashMap newGridList, int smallCluster, int bigCluster); - double outlier_density_thresholdFunction(int tg, double cl, double decayFactor, int NGrids); - bool checkIfSporadic(CharacteristicVector characteristicVec); - void updateGridListDensity(); - static void mergeGridList(HashMap &gridList, const HashMap &otherList); - // HashMap putHashMap(HashMap gList, const DensityGrid& g, CharacteristicVector cv); + bool recalculateN = false; // flag indicating whether N needs to be + // recalculated after this instance + std::vector Coord; + void ifReCalculate(PointPtr point); + void reCalculateParameter(); + void GridListUpdate(std::vector coordinate); + void initialClustering(); + void adjustClustering(); + bool adjustLabels(); + bool inspectChangedGrids(); + HashMap adjustForSparseGrid(DensityGrid grid, + CharacteristicVector characteristicVec, + int gridClass); + HashMap adjustForDenseGrid(DensityGrid grid, + CharacteristicVector characteristicVec, + int gridClass); + HashMap adjustForTransitionalGrid(DensityGrid grid, + CharacteristicVector characteristicVec, + int gridClass); + void removeSporadic(); + HashMap reCluster(GridCluster gridCluster); + HashMap adjustNewLabels(HashMap newGridList); + void mergeClusters(int smallCluster, int bigCluster); + void cleanClusters(); + HashMap cleanNewClusters(HashMap newGridList); + HashMap mergeNewClusters(HashMap newGridList, int smallCluster, + int bigCluster); + double outlier_density_thresholdFunction(int tg, double cl, + double decayFactor, int NGrids); + bool checkIfSporadic(CharacteristicVector characteristicVec); + void updateGridListDensity(); + static void mergeGridList(HashMap &gridList, const HashMap 
&otherList); + // HashMap putHashMap(HashMap gList, const DensityGrid& g, + // CharacteristicVector cv); }; -} // namespace SESAME +} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_DSTREAM_HPP_ +#endif // SESAME_INCLUDE_ALGORITHM_DSTREAM_HPP_ diff --git a/include/Algorithm/DataStructure/CFTree.hpp b/include/Algorithm/DataStructure/CFTree.hpp index 2e2abf25..bd8b9ce1 100644 --- a/include/Algorithm/DataStructure/CFTree.hpp +++ b/include/Algorithm/DataStructure/CFTree.hpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -20,294 +21,266 @@ #include "Algorithm/DataStructure/Point.hpp" #include "Algorithm/Param.hpp" -namespace SESAME -{ +namespace SESAME { // define the share point of the class object class CFNode; class CFTree; typedef std::shared_ptr NodePtr; typedef std::shared_ptr CFTreePtr; -class CFTree -{ +class CFTree { private: - int max_in_nodes; // max CF number of each internal node - int max_leaf_nodes; // max CF number of each leaf node - double distance_threshold; // threshold radius of each sub cluster in leaf nodes + int max_in_nodes; // max CF number of each internal node + int max_leaf_nodes; // max CF number of each leaf node + double + distance_threshold; // threshold radius of each sub cluster in leaf nodes public: - CFTree(const SesameParam ¶m); - CFTree(int b, int l, double t); - ~CFTree(); - int getB() const; - int getL() const; - double getT() const; - void setB(int b); - void setL(int l); - void setT(double t); + CFTree(const SesameParam ¶m); + CFTree(int b, int l, double t); + ~CFTree(); + int getB() const; + int getL() const; + double getT() const; + void setB(int b); + void setL(int l); + void setT(double t); }; -class CFNode -{ +class CFNode { private: - CFPtr curCF; - bool isLeaf; - bool outlier; - std::vector children; - NodePtr parent; - int index; + CFPtr curCF; + bool isLeaf; + bool outlier; + std::vector children; + NodePtr parent; + int index; public: - CFNode(); - ~CFNode(); - CFPtr 
getCF(); - void setCF(CFPtr &cf); - NodePtr getParent(); - int getIndex() const; - std::vector getChildren(); - void removeChild(NodePtr &child); - NodePtr copy(); - bool getIsLeaf(); - void setIsLeaf(bool leaf); - void setNode(CFPtr &Node); - void setIndex(int Index); - void setParent(NodePtr &Parent); - void setChild(NodePtr &child); - void setChildren(std::vector children); - void clearParents(); - void setOutlier(bool flag); - bool getOutlier(); - std::string Prefix(int d) - { - std::string prefix = ""; - while (d--) - { - prefix += "- "; - } - return prefix; - } - std::string Serialize(int d = 0) - { - std::string str = - Prefix(d) + std::to_string(index) + ":" + std::to_string(curCF->getN()) + "\n"; - return str; + CFNode(); + ~CFNode(); + CFPtr getCF(); + void setCF(CFPtr &cf); + NodePtr getParent(); + int getIndex() const; + std::vector getChildren(); + void removeChild(NodePtr &child); + NodePtr copy(); + bool getIsLeaf(); + void setIsLeaf(bool leaf); + void setNode(CFPtr &Node); + void setIndex(int Index); + void setParent(NodePtr &Parent); + void setChild(NodePtr &child); + void setChildren(std::vector children); + void clearParents(); + void setOutlier(bool flag); + bool getOutlier(); + std::string Prefix(int d) { + std::string prefix = ""; + while (d--) { + prefix += "- "; } + return prefix; + } + std::string Serialize(int d = 0) { + std::string str = Prefix(d) + std::to_string(index) + ":" + + std::to_string(curCF->getN()) + "\n"; + return str; + } }; -class ClusteringFeaturesTree : public std::enable_shared_from_this -{ +class ClusteringFeaturesTree + : public std::enable_shared_from_this { private: - const int max_in_nodes; // max CF number of each internal node - const int max_leaf_nodes; // max CF number of each leaf node - const double distance_threshold; // threshold radius of each sub cluster in leaf nodes - const int dim; - int leafMask = 0; + const int max_in_nodes; // max CF number of each internal node + const int max_leaf_nodes; // max CF 
number of each leaf node + const double + distance_threshold; // threshold radius of each sub cluster in leaf nodes + const int dim; + int leafMask = 0; public: - struct Node; - using NodePtr = std::shared_ptr; - using TreePtr = std::shared_ptr; - ClusteringFeaturesTree(const SesameParam ¶m); - ~ClusteringFeaturesTree(); - void Init(); - NodePtr Insert(PointPtr point); - NodePtr Insert(NodePtr node); - void Remove(NodePtr node); - void ForEach(std::function func); - std::string Serialize(); - std::vector &clusters(); - NodePtr root() { return root_; } + struct Node; + using NodePtr = std::shared_ptr; + using TreePtr = std::shared_ptr; + ClusteringFeaturesTree(const SesameParam ¶m); + ~ClusteringFeaturesTree(); + void Init(); + NodePtr Insert(PointPtr point); + NodePtr Insert(NodePtr node); + void Remove(NodePtr node); + void ForEach(std::function func); + std::string Serialize(); + std::vector &clusters(); + NodePtr root() { return root_; } private: - template - NodePtr backwardEvolution(NodePtr node, T point); - NodePtr root_; - std::vector clusters_; + template NodePtr backwardEvolution(NodePtr node, T point); + NodePtr root_; + std::vector clusters_; public: - struct Node : std::enable_shared_from_this - { - size_t timestamp = 0; - NodePtr parent = nullptr; - std::vector children; - int index = 0; - const int dim; - ClusteringFeatures cf; - TreePtr tree; + struct Node : std::enable_shared_from_this { + size_t timestamp = 0; + NodePtr parent = nullptr; + std::vector children; + int index = 0; + const int dim; + ClusteringFeatures cf; + TreePtr tree; - Node(TreePtr tree, int d = 0) : tree(tree), dim(d), cf(d){}; - Node(TreePtr tree, PointPtr p) : Node(tree, p->getDimension()) { Update(p); } - ~Node() = default; - void RemoveChild(NodePtr child) - { - const auto [first, last] = - std::ranges::remove_if(children, [&](auto &c) { return c == child; }); - children.erase(first, last); - } - bool IsLeaf() const { return children.empty(); } - void AddChild(NodePtr child) 
- { - children.push_back(child); - child->parent = shared_from_this(); - } - void ClearParents() - { - if (parent != nullptr) parent->index = -1; - } - void Update(PointPtr point) - { - cf.num += point->sgn; - for (int i = 0; i < dim; ++i) - { - auto val = point->getFeatureItem(i); - cf.ls[i] += val * point->sgn; - cf.ss[i] += (val * val) * point->sgn; - } - if (cf.num == 0) - { - if (tree != nullptr) tree->Remove(shared_from_this()); - } - } - void Update(NodePtr node) - { - cf.num += node->cf.num; - for (int i = 0; i < dim; ++i) - { - cf.ls[i] += node->cf.ls[i]; - cf.ss[i] += node->cf.ss[i] * node->cf.ss[i]; - } - } - void Scale(double scale) - { - for (int i = 0; i < dim; ++i) - { - cf.ls[i] *= scale; - cf.ss[i] *= scale * scale; - } - // auto a = cf.ls.data(), b = cf.ss.data(); - // auto factor1 = _mm256_set1_pd(scale), factor2 = _mm256_set1_pd(scale * - // scale); for (size_t i = 0; i < dim; i += 4) { - // _mm256_mul_pd(_mm256_loadu_pd(a + i), factor1); - // _mm256_mul_pd(_mm256_loadu_pd(b + i), factor2); - // } - } - template - void Update(T point, bool all) - { - Update(point); - if (parent != nullptr && all) - { - parent->Update(point, all); - } - } - PointPtr Centroid() - { - assert(cf.num); - auto c = GenericFactory::New(dim); - c->setIndex(-1); - c->setClusteringCenter(-1); - for (int i = 0; i < dim; ++i) c->setFeatureItem(cf.ls[i] / cf.num, i); - return c; - } - std::string Prefix(int d) - { - std::string prefix = ""; - while (d--) - { - prefix += "- "; - } - return prefix; - } - std::string Serialize(int d = 0) - { - std::string str = - Prefix(d) + std::to_string(index) + ":" + std::to_string(cf.num) + "\n"; - return str; - } - }; + Node(TreePtr tree, int d = 0) : tree(tree), dim(d), cf(d) {} + Node(TreePtr tree, PointPtr p) : Node(tree, p->getDimension()) { + Update(p); + } + ~Node() = default; + void RemoveChild(NodePtr child) { + const auto [first, last] = + std::ranges::remove_if(children, [&](auto &c) { return c == child; }); + 
children.erase(first, last); + } + bool IsLeaf() const { return children.empty(); } + void AddChild(NodePtr child) { + children.push_back(child); + child->parent = shared_from_this(); + } + void ClearParents() { + if (parent != nullptr) + parent->index = -1; + } + void Update(PointPtr point) { + cf.num += point->sgn; + for (int i = 0; i < dim; ++i) { + auto val = point->getFeatureItem(i); + cf.ls[i] += val * point->sgn; + cf.ss[i] += (val * val) * point->sgn; + } + if (cf.num == 0) { + if (tree != nullptr) + tree->Remove(shared_from_this()); + } + } + void Update(NodePtr node) { + cf.num += node->cf.num; + for (int i = 0; i < dim; ++i) { + cf.ls[i] += node->cf.ls[i]; + cf.ss[i] += node->cf.ss[i] * node->cf.ss[i]; + } + } + void Scale(double scale) { + for (int i = 0; i < dim; ++i) { + cf.ls[i] *= scale; + cf.ss[i] *= scale * scale; + } + // auto a = cf.ls.data(), b = cf.ss.data(); + // auto factor1 = _mm256_set1_pd(scale), factor2 = _mm256_set1_pd(scale * + // scale); for (size_t i = 0; i < dim; i += 4) { + // _mm256_mul_pd(_mm256_loadu_pd(a + i), factor1); + // _mm256_mul_pd(_mm256_loadu_pd(b + i), factor2); + // } + } + template void Update(T point, bool all) { + Update(point); + if (parent != nullptr && all) { + parent->Update(point, all); + } + } + PointPtr Centroid() { + assert(cf.num); + auto c = GenericFactory::New(dim); + c->setIndex(-1); + c->setClusteringCenter(-1); + for (int i = 0; i < dim; ++i) + c->setFeatureItem(cf.ls[i] / cf.num, i); + return c; + } + std::string Prefix(int d) { + std::string prefix = ""; + while (d--) { + prefix += "- "; + } + return prefix; + } + std::string Serialize(int d = 0) { + std::string str = Prefix(d) + std::to_string(index) + ":" + + std::to_string(cf.num) + "\n"; + return str; + } + }; }; -class ClusteringFeaturesList -{ +class ClusteringFeaturesList { private: - const int dim; - const double distance_threshold; + const int dim; + const double distance_threshold; public: - struct Node; - using NodePtr = std::shared_ptr; 
- using ListPtr = std::shared_ptr; - ClusteringFeaturesList(const SesameParam ¶m); - ~ClusteringFeaturesList(); - NodePtr Insert(PointPtr point); - NodePtr Insert(NodePtr node); - void ForEach(std::function func) { std::ranges::for_each(clusters_, func); } - void Init() {} - std::vector &clusters(); - void Remove(NodePtr node); + struct Node; + using NodePtr = std::shared_ptr; + using ListPtr = std::shared_ptr; + ClusteringFeaturesList(const SesameParam ¶m); + ~ClusteringFeaturesList(); + NodePtr Insert(PointPtr point); + NodePtr Insert(NodePtr node); + void ForEach(std::function func) { + std::ranges::for_each(clusters_, func); + } + void Init() {} + std::vector &clusters(); + void Remove(NodePtr node); private: - std::vector clusters_; + std::vector clusters_; public: - struct Node : std::enable_shared_from_this - { - size_t timestamp = 0; - int index = 0; - const int dim; - ClusteringFeatures cf; + struct Node : std::enable_shared_from_this { + size_t timestamp = 0; + int index = 0; + const int dim; + ClusteringFeatures cf; - Node(int d = 0) : dim(d), cf(d){}; - Node(PointPtr p) : Node(p->getDimension()) { Update(p); } - Node(ListPtr l, PointPtr p) : Node(p) {} - ~Node() = default; - void Update(PointPtr point) - { - cf.num += point->sgn; - for (int i = 0; i < dim; ++i) - { - auto val = point->getFeatureItem(i); - cf.ls[i] += val; - cf.ss[i] += val * val; - } - } - void Update(NodePtr node) - { - cf.num += node->cf.num; - for (int i = 0; i < dim; ++i) - { - cf.ls[i] += node->cf.ls[i]; - cf.ss[i] += node->cf.ss[i] * node->cf.ss[i]; - } - } - template - void Update(T point, bool all) - { - Update(point); - } - void Scale(double scale) - { - for (int i = 0; i < dim; ++i) - { - cf.ls[i] *= scale; - cf.ss[i] *= scale * scale; - } - // auto a = cf.ls.data(), b = cf.ss.data(); - // auto factor1 = _mm256_set1_pd(scale), factor2 = _mm256_set1_pd(scale * - // scale); for (size_t i = 0; i < dim; i += 4) { - // _mm256_mul_pd(_mm256_loadu_pd(a + i), factor1); - // 
_mm256_mul_pd(_mm256_loadu_pd(b + i), factor2); - // } - } - PointPtr Centroid() - { - auto c = GenericFactory::New(dim); - c->setIndex(-1); - c->setClusteringCenter(-1); - for (int i = 0; i < dim; ++i) c->setFeatureItem(cf.ls[i] / cf.num, i); - return c; - } - }; + Node(int d = 0) : dim(d), cf(d) {} + Node(PointPtr p) : Node(p->getDimension()) { Update(p); } + Node(ListPtr l, PointPtr p) : Node(p) {} + ~Node() = default; + void Update(PointPtr point) { + cf.num += point->sgn; + for (int i = 0; i < dim; ++i) { + auto val = point->getFeatureItem(i); + cf.ls[i] += val; + cf.ss[i] += val * val; + } + } + void Update(NodePtr node) { + cf.num += node->cf.num; + for (int i = 0; i < dim; ++i) { + cf.ls[i] += node->cf.ls[i]; + cf.ss[i] += node->cf.ss[i] * node->cf.ss[i]; + } + } + template void Update(T point, bool all) { Update(point); } + void Scale(double scale) { + for (int i = 0; i < dim; ++i) { + cf.ls[i] *= scale; + cf.ss[i] *= scale * scale; + } + // auto a = cf.ls.data(), b = cf.ss.data(); + // auto factor1 = _mm256_set1_pd(scale), factor2 = _mm256_set1_pd(scale * + // scale); for (size_t i = 0; i < dim; i += 4) { + // _mm256_mul_pd(_mm256_loadu_pd(a + i), factor1); + // _mm256_mul_pd(_mm256_loadu_pd(b + i), factor2); + // } + } + PointPtr Centroid() { + auto c = GenericFactory::New(dim); + c->setIndex(-1); + c->setClusteringCenter(-1); + for (int i = 0; i < dim; ++i) + c->setFeatureItem(cf.ls[i] / cf.num, i); + return c; + } + }; }; -} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_CFTREE_HPP_ +} // namespace SESAME +#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_CFTREE_HPP_ diff --git a/include/Algorithm/DataStructure/Cache.hpp b/include/Algorithm/DataStructure/Cache.hpp index 6433399e..2906cbd5 100644 --- a/include/Algorithm/DataStructure/Cache.hpp +++ b/include/Algorithm/DataStructure/Cache.hpp @@ -12,51 +12,49 @@ #include #include -namespace SESAME -{ +namespace SESAME { // define the share point of the class object class Cache; 
typedef std::shared_ptr CachePtr; -class Cache -{ +class Cache { private: - int num; - int size; - double a; - double lamd; - double r; - std::vector buffer; - std::vector clus; - int pnum; + int num; + int size; + double a; + double lamd; + double r; + std::vector buffer; + std::vector clus; + int pnum; public: - Cache(); - ~Cache(); - int GetNum(); - void SetNum(int num); - int GetSize(); - void SetSize(int size); - double GetA(); - void SetA(double a); - double GetLamd(); - void SetLamd(double lamd); - double GetR(); - void SetR(double r); - std::vector &GetBuffer(); - void SetBuffer(std::vector &buffer); - std::vector &GetClus(); - void SetClus(std::vector &clus); - int GetPnum(); - void SetPnum(int pnum); - Cache(int num, double a, double lamd, double r); - DPNodePtr add(PointPtr &p, double startTime); - bool isFull(); - void compDeltaRho(double time); - void getDPTree(double minRho, double minDelta, DPTreePtr &dpTree, OutPtr &outs, - std::unordered_set &clusters); - // void outputBuffer(String outpath, double minRho) + Cache(); + ~Cache(); + int GetNum(); + void SetNum(int num); + int GetSize(); + void SetSize(int size); + double GetA(); + void SetA(double a); + double GetLamd(); + void SetLamd(double lamd); + double GetR(); + void SetR(double r); + std::vector &GetBuffer(); + void SetBuffer(std::vector &buffer); + std::vector &GetClus(); + void SetClus(std::vector &clus); + int GetPnum(); + void SetPnum(int pnum); + Cache(int num, double a, double lamd, double r); + DPNodePtr add(PointPtr &p, double startTime); + bool isFull(); + void compDeltaRho(double time); + void getDPTree(double minRho, double minDelta, DPTreePtr &dpTree, + OutPtr &outs, std::unordered_set &clusters); + // void outputBuffer(String outpath, double minRho) }; -} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_CACHE_HPP_ +} // namespace SESAME +#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_CACHE_HPP_ diff --git 
a/include/Algorithm/DataStructure/CharacteristicVector.hpp b/include/Algorithm/DataStructure/CharacteristicVector.hpp index bd7bd12b..61cebd9f 100644 --- a/include/Algorithm/DataStructure/CharacteristicVector.hpp +++ b/include/Algorithm/DataStructure/CharacteristicVector.hpp @@ -7,90 +7,83 @@ #include #include #include -namespace SESAME -{ -enum Status -{ - NO_CLASS = -1, - SPARSE, - TRANSITIONAL, - DENSE -}; -class CharacteristicVector -{ +namespace SESAME { +enum Status { NO_CLASS = -1, SPARSE, TRANSITIONAL, DENSE }; +class CharacteristicVector { public: - /** - * t_g: The last time when g is updated - */ - int updateTime; + /** + * t_g: The last time when g is updated + */ + int updateTime; - /** - * tm : last time when g is removed from grid_list as a sporadic grid (if ever). - */ - int removeTime; + /** + * tm : last time when g is removed from grid_list as a sporadic grid (if + * ever). + */ + int removeTime; - /** - * D: the grid density at the last update - */ - double gridDensity; + /** + * D: the grid density at the last update + */ + double gridDensity; - /** - * label: the cluster label of the grid - */ - int label; + /** + * label: the cluster label of the grid + */ + int label; - /** - * status: status = {SPORADIC, NORMAL} - */ - bool isSporadic; + /** + * status: status = {SPORADIC, NORMAL} + */ + bool isSporadic; - /** - * attribute: attribute = {SPARSE, TRANSITIONAL, DENSE} - */ - int attribute; + /** + * attribute: attribute = {SPARSE, TRANSITIONAL, DENSE} + */ + int attribute; - /** - * time stamp at which the grid's density was last updated (including initial and adjust - * clustering) - */ - int densityUpdateTime; + /** + * time stamp at which the grid's density was last updated (including initial + * and adjust clustering) + */ + int densityUpdateTime; - /** - * Flag marking whether there was a change in the attribute field - * the last time the grid density was updated. 
- */ - bool attChange; - bool isVisited = false; + /** + * Flag marking whether there was a change in the attribute field + * the last time the grid density was updated. + */ + bool attChange; + bool isVisited = false; - CharacteristicVector(); - CharacteristicVector(int updateTime, int removeTime, double Density, int label, bool isSporadic, - double dl, double dm); - double getCurrGridDensity(int NowTime, double lambda); - double getCurrGridDensity(); - bool isSparse(double dl); - bool isDense(double dm); - bool isTransitional(double dm, double dl); - /** - * Implements the density update function given in - * eq 5 (Proposition 3.1) of Chen and Tu 2007. - * - * @param currTime the data stream's current internal time - * @param decayFactor the value of lambda - */ - void densityWithNew(int NowTime, double decayFactor); - void densityWithNew(int NowTime); - /** - * Implements the update the density of all grids step given at line 2 of - * both Fig 3 and Fig 4 of Chen and Tu 2007. - * - * @param currTime the data stream's current internal time - * @param decayFactor the value of lambda - * @param dl the threshold for sparse grids - * @param dm the threshold for dense grids - */ - void UpdateAllDensity(int NowTime, double decayFactor, double dl, double dm); - void UpdateAllDensity(int NowTime, double dl, double dm); - void ChangeAttribute(double dl, double dm); + CharacteristicVector(); + CharacteristicVector(int updateTime, int removeTime, double Density, + int label, bool isSporadic, double dl, double dm); + double getCurrGridDensity(int NowTime, double lambda); + double getCurrGridDensity(); + bool isSparse(double dl); + bool isDense(double dm); + bool isTransitional(double dm, double dl); + /** + * Implements the density update function given in + * eq 5 (Proposition 3.1) of Chen and Tu 2007. 
+ * + * @param currTime the data stream's current internal time + * @param decayFactor the value of lambda + */ + void densityWithNew(int NowTime, double decayFactor); + void densityWithNew(int NowTime); + /** + * Implements the update the density of all grids step given at line 2 of + * both Fig 3 and Fig 4 of Chen and Tu 2007. + * + * @param currTime the data stream's current internal time + * @param decayFactor the value of lambda + * @param dl the threshold for sparse grids + * @param dm the threshold for dense grids + */ + void UpdateAllDensity(int NowTime, double decayFactor, double dl, double dm); + void UpdateAllDensity(int NowTime, double dl, double dm); + void ChangeAttribute(double dl, double dm); }; -} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_CHARACTERISTICVECTOR_HPP_ +} // namespace SESAME +#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_CHARACTERISTICVECTOR_HPP_ diff --git a/include/Algorithm/DataStructure/CoresetTree.hpp b/include/Algorithm/DataStructure/CoresetTree.hpp index b35dc818..88501b2e 100644 --- a/include/Algorithm/DataStructure/CoresetTree.hpp +++ b/include/Algorithm/DataStructure/CoresetTree.hpp @@ -1,4 +1,5 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) // // Created by Shuhao Zhang on 19/07/2021. 
@@ -15,98 +16,93 @@ #include #include -namespace SESAME -{ +namespace SESAME { -class CoresetTree : public std::enable_shared_from_this -{ +class CoresetTree : public std::enable_shared_from_this { public: - struct Node; - struct Bucket; - using NodePtr = std::shared_ptr; - using TreePtr = std::shared_ptr; - using Points = std::shared_ptr>; + struct Node; + struct Bucket; + using NodePtr = std::shared_ptr; + using TreePtr = std::shared_ptr; + using Points = std::shared_ptr>; private: - const SesameParam ¶m; - std::vector samples; - bool has_sampled = false; - Random r; - NodePtr root = nullptr; - std::vector buckets; - size_t num_buckets = 0; - std::vector centers; - std::vector clusters_; - std::vector Union(const std::vector &a, const std::vector &b); - NodePtr Select(NodePtr); - PointPtr ChooseCenter(NodePtr); - void Split(NodePtr, PointPtr, int); - std::vector Points2Nodes(Points); + const SesameParam ¶m; + std::vector samples; + bool has_sampled = false; + Random r; + NodePtr root = nullptr; + std::vector buckets; + size_t num_buckets = 0; + std::vector centers; + std::vector clusters_; + std::vector Union(const std::vector &a, + const std::vector &b); + NodePtr Select(NodePtr); + PointPtr ChooseCenter(NodePtr); + void Split(NodePtr, PointPtr, int); + std::vector Points2Nodes(Points); public: - CoresetTree(const SesameParam ¶m); - void Init(); - NodePtr Insert(PointPtr input); - NodePtr Insert(NodePtr node); - void Remove(NodePtr node); - std::vector &clusters(); + CoresetTree(const SesameParam ¶m); + void Init(); + NodePtr Insert(PointPtr input); + NodePtr Insert(NodePtr node); + void Remove(NodePtr node); + std::vector &clusters(); public: - struct Bucket - { - Points base, spill; - Bucket() - : base(std::make_shared>()), - spill(std::make_shared>()) - {} - }; - struct Node : std::enable_shared_from_this - { - size_t timestamp = 0; - int index = 0; - const int dim; - ClusteringFeatures cf; - double costs_sum_dist = 0.0, costs_sum_sq_dist = 0.0; - NodePtr lc 
= nullptr, rc = nullptr, parent = nullptr; - PointPtr center; - std::vector points; - Node(TreePtr s, PointPtr p) : dim(p->getDimension()), cf(p->getDimension()) { Update(p); } - Node(PointPtr p) : dim(p->getDimension()), cf(p->dim), center(p) {} - PointPtr Centroid() - { - auto c = GenericFactory::New(dim); - for (int i = 0; i < dim; ++i) c->setFeatureItem(cf.ls[i] / cf.num, i); - return c; - } - PointPtr Center() { return center; } - void Update(PointPtr point) - { - cf.num += point->sgn; - double d = point->L2Dist(Centroid()); - costs_sum_dist += d * point->sgn; - costs_sum_sq_dist += d * d * point->sgn; - for (int i = 0; i < dim; ++i) - { - auto val = point->getFeatureItem(i); - cf.ls[i] += val * point->sgn; - cf.ss[i] += (val * val) * point->sgn; - } - points.push_back(point); - } - void Scale(double scale) - { - costs_sum_dist *= scale; - costs_sum_sq_dist *= scale * scale; - for (int i = 0; i < dim; ++i) - { - cf.ls[i] *= scale; - cf.ss[i] *= scale * scale; - } - } - bool IsLeaf() { return lc == nullptr && rc == nullptr; } - }; + struct Bucket { + Points base, spill; + Bucket() + : base(std::make_shared>()), + spill(std::make_shared>()) {} + }; + struct Node : std::enable_shared_from_this { + size_t timestamp = 0; + int index = 0; + const int dim; + ClusteringFeatures cf; + double costs_sum_dist = 0.0, costs_sum_sq_dist = 0.0; + NodePtr lc = nullptr, rc = nullptr, parent = nullptr; + PointPtr center; + std::vector points; + Node(TreePtr s, PointPtr p) + : dim(p->getDimension()), cf(p->getDimension()) { + Update(p); + } + Node(PointPtr p) : dim(p->getDimension()), cf(p->dim), center(p) {} + PointPtr Centroid() { + auto c = GenericFactory::New(dim); + for (int i = 0; i < dim; ++i) + c->setFeatureItem(cf.ls[i] / cf.num, i); + return c; + } + PointPtr Center() { return center; } + void Update(PointPtr point) { + cf.num += point->sgn; + double d = point->L2Dist(Centroid()); + costs_sum_dist += d * point->sgn; + costs_sum_sq_dist += d * d * point->sgn; + for (int 
i = 0; i < dim; ++i) { + auto val = point->getFeatureItem(i); + cf.ls[i] += val * point->sgn; + cf.ss[i] += (val * val) * point->sgn; + } + points.push_back(point); + } + void Scale(double scale) { + costs_sum_dist *= scale; + costs_sum_sq_dist *= scale * scale; + for (int i = 0; i < dim; ++i) { + cf.ls[i] *= scale; + cf.ss[i] *= scale * scale; + } + } + bool IsLeaf() { return lc == nullptr && rc == nullptr; } + }; }; -} // namespace SESAME +} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_CORESETTREE_HPP_ +#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_CORESETTREE_HPP_ diff --git a/include/Algorithm/DataStructure/DPNode.hpp b/include/Algorithm/DataStructure/DPNode.hpp index 126e063a..2f257a8d 100644 --- a/include/Algorithm/DataStructure/DPNode.hpp +++ b/include/Algorithm/DataStructure/DPNode.hpp @@ -9,92 +9,89 @@ #include #include #include -namespace SESAME -{ +namespace SESAME { class DPNode; class Cluster; typedef std::shared_ptr DPNodePtr; typedef std::shared_ptr ClusterPtr; -class Cluster -{ +class Cluster { private: - int label; - std::unordered_set cells; + int label; + std::unordered_set cells; public: - explicit Cluster(); - explicit Cluster(int label); - void add(DPNodePtr &node); - void remove(DPNodePtr &node); - int GetLabel(); - void SetLabel(int label); - [[nodiscard]] std::unordered_set &GetCells(); - void SetCells(std::unordered_set &cells); + explicit Cluster(); + explicit Cluster(int label); + void add(DPNodePtr &node); + void remove(DPNodePtr &node); + int GetLabel(); + void SetLabel(int label); + [[nodiscard]] std::unordered_set &GetCells(); + void SetCells(std::unordered_set &cells); }; -class DPNode -{ +class DPNode { private: - int cid; - int Cid; - int num; - double rho; - double delta; - SESAME::DPNodePtr dep; // TODO: father - SESAME::PointPtr center; - double lastTime; - bool active; - std::unordered_set sucs; // TODO: children - SESAME::ClusterPtr cluster; - // public double sumDelta; - // public int sucNum; - 
double inactiveTime; + int cid; + int Cid; + int num; + double rho; + double delta; + SESAME::DPNodePtr dep; // TODO: father + SESAME::PointPtr center; + double lastTime; + bool active; + std::unordered_set sucs; // TODO: children + SESAME::ClusterPtr cluster; + // public double sumDelta; + // public int sucNum; + double inactiveTime; - /** - * we will use dis to quickly update the delta of CluCell - */ - double dis; + /** + * we will use dis to quickly update the delta of CluCell + */ + double dis; public: - DPNode(); - ~DPNode(); - DPNode(SESAME::PointPtr &p, double time); - [[nodiscard]] int GetId(); - void SetId(int id); - [[nodiscard]] int GetCId(); - void SetCId(int Cid); - [[nodiscard]] int GetNum(); - void SetNum(int num); - [[nodiscard]] double GetRho(); - void SetRho(double rho); - [[nodiscard]] double GetDelta(); - void SetDelta(double delta); - [[nodiscard]] DPNodePtr &GetDep(); - void SetDep(DPNodePtr &dep); - [[nodiscard]] PointPtr &GetCenter(); - void SetCenter(PointPtr ¢er); - [[nodiscard]] double GetLastTime(); - void SetLastTime(double last_time); - [[nodiscard]] bool IsActive(); - void SetActive(bool active); - [[nodiscard]] std::unordered_set &GetSucs(); - void SetSucs(std::unordered_set &sucs); - [[nodiscard]] ClusterPtr &GetCluster(); - void SetCluster(SESAME::ClusterPtr &cluster); - [[nodiscard]] double GetInactiveTime(); - void SetInactiveTime(double inactive_time); - [[nodiscard]] double GetDis(); - void SetDis(double dis); - SESAME::DPNodePtr copy(); + DPNode(); + ~DPNode(); + DPNode(SESAME::PointPtr &p, double time); + [[nodiscard]] int GetId(); + void SetId(int id); + [[nodiscard]] int GetCId(); + void SetCId(int Cid); + [[nodiscard]] int GetNum(); + void SetNum(int num); + [[nodiscard]] double GetRho(); + void SetRho(double rho); + [[nodiscard]] double GetDelta(); + void SetDelta(double delta); + [[nodiscard]] DPNodePtr &GetDep(); + void SetDep(DPNodePtr &dep); + [[nodiscard]] PointPtr &GetCenter(); + void SetCenter(PointPtr ¢er); + 
[[nodiscard]] double GetLastTime(); + void SetLastTime(double last_time); + [[nodiscard]] bool IsActive(); + void SetActive(bool active); + [[nodiscard]] std::unordered_set &GetSucs(); + void SetSucs(std::unordered_set &sucs); + [[nodiscard]] ClusterPtr &GetCluster(); + void SetCluster(SESAME::ClusterPtr &cluster); + [[nodiscard]] double GetInactiveTime(); + void SetInactiveTime(double inactive_time); + [[nodiscard]] double GetDis(); + void SetDis(double dis); + SESAME::DPNodePtr copy(); - void insert(double startTime); - void add(double coef, double startTime); - double getDisTo(SESAME::DPNodePtr &node); - void removeSuccessor(SESAME::DPNodePtr &node); - bool hasSuccessor(); + void insert(double startTime); + void add(double coef, double startTime); + double getDisTo(SESAME::DPNodePtr &node); + void removeSuccessor(SESAME::DPNodePtr &node); + bool hasSuccessor(); - void addSuccessor(DPNodePtr &node); + void addSuccessor(DPNodePtr &node); }; -} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_DPNODE_HPP_ +} // namespace SESAME +#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_DPNODE_HPP_ diff --git a/include/Algorithm/DataStructure/DPTree.hpp b/include/Algorithm/DataStructure/DPTree.hpp index c9e80aa4..3a15f701 100644 --- a/include/Algorithm/DataStructure/DPTree.hpp +++ b/include/Algorithm/DataStructure/DPTree.hpp @@ -11,75 +11,74 @@ #include #include #include -namespace SESAME -{ +namespace SESAME { // define the share point of the class object class DPTree; typedef std::shared_ptr DPTreePtr; -class DPTree -{ +class DPTree { private: - double lastTime; - int size; - int num; - std::vector Clus; + double lastTime; + int size; + int num; + std::vector Clus; - double a; - double lamd; - double CluR; + double a; + double lamd; + double CluR; - int cluLabel; + int cluLabel; public: - double GetLastTime(); - void SetLastTime(double last_time); - int GetSize(); - void SetSize(int size); - int GetNum(); - void SetNum(int num); - std::vector 
&GetClus(); - void SetClus(std::vector &clus); - double GetA(); - void SetA(double a); - double GetLamd(); - void SetLamd(double lamd); - double GetCluR(); - void SetCluR(double clu_r); - int GetCluLabel(); - void SetCluLabel(int clu_label); - double GetMinDelta(); - void SetMinDelta(double min_delta); + double GetLastTime(); + void SetLastTime(double last_time); + int GetSize(); + void SetSize(int size); + int GetNum(); + void SetNum(int num); + std::vector &GetClus(); + void SetClus(std::vector &clus); + double GetA(); + void SetA(double a); + double GetLamd(); + void SetLamd(double lamd); + double GetCluR(); + void SetCluR(double clu_r); + int GetCluLabel(); + void SetCluLabel(int clu_label); + double GetMinDelta(); + void SetMinDelta(double min_delta); private: - double minDelta; + double minDelta; public: - DPTree(); - ~DPTree(); - DPTree(int num, double CluR); - void insert(SESAME::DPNodePtr &cc, int opt); - void Init(std::vector &clus, int size, double minRho, double minDelta, - SESAME::OutPtr &outs, std::unordered_set &clusters); - SESAME::DPNodePtr findNN(PointPtr p, double coef, int opt, double time); - void adjustNoDelta(int index); - void adjustNoOpt(int index); - void computeDeltaNoOpt(int index); - void adjustOpt1(int index); - void computeDeltaF1(int index); - void adjust(int index); - void computeHeadDelta(); - void computeDelta(int index); - void deleteInact(SESAME::OutPtr &outres, double minRho, double time); - double computeAlpha(double minDelta); - double adjustMinDelta(double alpha); - // double djustMinDelta(double alpha, double minDelta); - double fun(double alpha, double upavg, double downavg, double avg); - // bool check(SESAME::OutPtr outres); - // void check(std::vector clusters); - void adjustCluster(std::unordered_set &clusters); + DPTree(); + ~DPTree(); + DPTree(int num, double CluR); + void insert(SESAME::DPNodePtr &cc, int opt); + void Init(std::vector &clus, int size, double minRho, + double minDelta, SESAME::OutPtr &outs, + 
std::unordered_set &clusters); + SESAME::DPNodePtr findNN(PointPtr p, double coef, int opt, double time); + void adjustNoDelta(int index); + void adjustNoOpt(int index); + void computeDeltaNoOpt(int index); + void adjustOpt1(int index); + void computeDeltaF1(int index); + void adjust(int index); + void computeHeadDelta(); + void computeDelta(int index); + void deleteInact(SESAME::OutPtr &outres, double minRho, double time); + double computeAlpha(double minDelta); + double adjustMinDelta(double alpha); + // double djustMinDelta(double alpha, double minDelta); + double fun(double alpha, double upavg, double downavg, double avg); + // bool check(SESAME::OutPtr outres); + // void check(std::vector clusters); + void adjustCluster(std::unordered_set &clusters); }; -} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_DPTREE_HPP_ +} // namespace SESAME +#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_DPTREE_HPP_ diff --git a/include/Algorithm/DataStructure/DataStructureFactory.hpp b/include/Algorithm/DataStructure/DataStructureFactory.hpp index fc3e2129..400c9398 100644 --- a/include/Algorithm/DataStructure/DataStructureFactory.hpp +++ b/include/Algorithm/DataStructure/DataStructureFactory.hpp @@ -23,41 +23,37 @@ #include -namespace SESAME -{ +namespace SESAME { -class DataStructureFactory -{ +class DataStructureFactory { public: - static PointPtr createPoint(int dim); - static PointPtr createPoint(int index, double weight, int dim, double cost, int timestamp); - static PointPtr createPoint(int index, double weight, int dim, double cost); - static void clearPoint(PointPtr point); - - static TreeNodePtr createTreeNode(); - static void clearTreeNode(TreeNodePtr treeNode); - static MicroClusterPtr createMicroCluster(int dim, int id); - static MicroClusterPtr createMicroCluster(int dim, int id, PointPtr dataPoint, double radius); - static void clearMicroCluster(MicroClusterPtr microCluster); - static SnapshotPtr createSnapshot(MicroClusters 
&otherMicroClusters, int elapsedTime); - static void clearSnapshot(SnapshotPtr snapshot); - static CFTreePtr createCFTree(); - static NodePtr createNode(); - // EDMStream - static DPTreePtr createDPTree(int num, double r); - static DPNodePtr createDPNode(); - static DPNodePtr createDPNode(SESAME::PointPtr p, double time); - static CachePtr creatCache(); - static CachePtr creatCache(int num, double a, double lamd, double r); - static OutPtr createOutlierReservoir(); - static OutPtr createOutlierReservoir(double r, double a, double lamd); - - static MicroClusterPairPtr createMicroClusterPair(MicroClusterPtr microCluster1, - MicroClusterPtr microCluster2); - static void clearMicroClusterPair(MicroClusterPairPtr microClusterPair); - static AdjustedWeightPtr createAdjustedWeight(double weight, int pointTime, - timespec pointTime0); - static void clearAdjustedWeight(AdjustedWeightPtr adjustedWeight); + static TreeNodePtr createTreeNode(); + static void clearTreeNode(TreeNodePtr treeNode); + static MicroClusterPtr createMicroCluster(int dim, int id); + static MicroClusterPtr createMicroCluster(int dim, int id, PointPtr dataPoint, + double radius); + static void clearMicroCluster(MicroClusterPtr microCluster); + static SnapshotPtr createSnapshot(MicroClusters &otherMicroClusters, + int elapsedTime); + static void clearSnapshot(SnapshotPtr snapshot); + static CFTreePtr createCFTree(); + static NodePtr createNode(); + // EDMStream + static DPTreePtr createDPTree(int num, double r); + static DPNodePtr createDPNode(); + static DPNodePtr createDPNode(SESAME::PointPtr p, double time); + static CachePtr creatCache(); + static CachePtr creatCache(int num, double a, double lamd, double r); + static OutPtr createOutlierReservoir(); + static OutPtr createOutlierReservoir(double r, double a, double lamd); + + static MicroClusterPairPtr + createMicroClusterPair(MicroClusterPtr microCluster1, + MicroClusterPtr microCluster2); + static void clearMicroClusterPair(MicroClusterPairPtr 
microClusterPair); + static AdjustedWeightPtr createAdjustedWeight(double weight, int pointTime, + timespec pointTime0); + static void clearAdjustedWeight(AdjustedWeightPtr adjustedWeight); }; -} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_DATASTRUCTUREFACTORY_HPP_ +} // namespace SESAME +#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_DATASTRUCTUREFACTORY_HPP_ diff --git a/include/Algorithm/DataStructure/DensityGrid.hpp b/include/Algorithm/DataStructure/DensityGrid.hpp index d3d5477a..7adc0447 100644 --- a/include/Algorithm/DataStructure/DensityGrid.hpp +++ b/include/Algorithm/DataStructure/DensityGrid.hpp @@ -10,92 +10,90 @@ #include #include -namespace SESAME -{ +namespace SESAME { class DensityGrid; typedef std::shared_ptr DensityGridPtr; -class DensityGrid -{ +class DensityGrid { public: - /** - * For each dim, its space Si, i =1, ··· ,d is divided into pi partitions as - * Si = Si,1 U Si,2 U ··· U Si,pi - * A density grid g that is composed of S1,j1 ×S2,j2 ···×Sd,jd , ji =1, ...,pi, - * has coordinates (j1,j2, ··· ,jd). - */ - std::vector coordinates; - /** - * The value of 'd' for the d-dimal space S considered by D-Stream. - */ - int dims; + /** + * For each dim, its space Si, i =1, ··· ,d is divided into pi partitions as + * Si = Si,1 U Si,2 U ··· U Si,pi + * A density grid g that is composed of S1,j1 ×S2,j2 ···×Sd,jd , ji =1, + * ...,pi, has coordinates (j1,j2, ··· ,jd). + */ + std::vector coordinates; + /** + * The value of 'd' for the d-dimal space S considered by D-Stream. + */ + int dims; - /** - * Flag denoting whether this density grid has been inspected during the adjustClustering() - * step of D-Stream. - */ - bool isVisited; - /** - * A constructor method for a density grid - * - * @param c the coordinates of the density grid - */ - DensityGrid(); - DensityGrid(const std::vector &coordin); + /** + * Flag denoting whether this density grid has been inspected during the + * adjustClustering() step of D-Stream. 
+ */ + bool isVisited; + /** + * A constructor method for a density grid + * + * @param c the coordinates of the density grid + */ + DensityGrid(); + DensityGrid(const std::vector &coordin); - /** - * A constructor method for a density grid - * - * @param dg the density grid to copy - */ - DensityGrid(DensityGrid const &grid); - /** - * Generates a vector of neighbours for this density grid by varying each coordinate - * by one in either direction. Does not test whether the generated neighbours are valid as - * DensityGrid is not aware of the number of partitions in each dim. - * - * @return a vector of neighbours for this density grid - */ - std::vector getNeighbours() const; + /** + * A constructor method for a density grid + * + * @param dg the density grid to copy + */ + DensityGrid(DensityGrid const &grid); + /** + * Generates a vector of neighbours for this density grid by varying each + * coordinate by one in either direction. Does not test whether the generated + * neighbours are valid as DensityGrid is not aware of the number of + * partitions in each dim. + * + * @return a vector of neighbours for this density grid + */ + std::vector getNeighbours() const; - /** - * Provides the probability of the argument instance belonging to the density grid in question. - * - * @return 1.0 if the instance equals the density grid's coordinates; 0.0 otherwise. - */ + /** + * Provides the probability of the argument instance belonging to the density + * grid in question. + * + * @return 1.0 if the instance equals the density grid's coordinates; 0.0 + * otherwise. 
+ */ - double getInclusionProbability(Point point); + double getInclusionProbability(Point point); - bool operator==(DensityGrid &gridOther) const; + bool operator==(DensityGrid &gridOther) const; }; -struct GridKeyHash -{ - std::size_t operator()(const DensityGrid &densityGrid) const - { - // int[] primes = {31, 37, 41, 43, 47, 53, 59}; - int hc = 1; - for (int i = 0; i < densityGrid.dims; i++) - { - hc = (hc * 31) + densityGrid.coordinates[i]; - } - - return hc; +struct GridKeyHash { + std::size_t operator()(const DensityGrid &densityGrid) const { + // int[] primes = {31, 37, 41, 43, 47, 53, 59}; + int hc = 1; + for (int i = 0; i < densityGrid.dims; i++) { + hc = (hc * 31) + densityGrid.coordinates[i]; } + + return hc; + } }; -struct EqualGrid -{ - bool operator()(const DensityGrid &densityGrid1, const DensityGrid &densityGrid2) const - { - if (densityGrid1.dims != densityGrid2.dims) return false; - for (int i = 0; i < densityGrid1.dims; i++) - { - if (densityGrid1.coordinates[i] != densityGrid2.coordinates[i]) return false; - } - return true; +struct EqualGrid { + bool operator()(const DensityGrid &densityGrid1, + const DensityGrid &densityGrid2) const { + if (densityGrid1.dims != densityGrid2.dims) + return false; + for (int i = 0; i < densityGrid1.dims; i++) { + if (densityGrid1.coordinates[i] != densityGrid2.coordinates[i]) + return false; } + return true; + } }; -} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_DENSITYGRID_HPP_ +} // namespace SESAME +#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_DENSITYGRID_HPP_ diff --git a/include/Algorithm/DataStructure/FeatureVector.hpp b/include/Algorithm/DataStructure/FeatureVector.hpp index 582848ac..6bb2139c 100644 --- a/include/Algorithm/DataStructure/FeatureVector.hpp +++ b/include/Algorithm/DataStructure/FeatureVector.hpp @@ -11,106 +11,91 @@ #include #include -namespace SESAME -{ +namespace SESAME { class CF; typedef std::shared_ptr CFPtr; -class CF -{ +class CF { private: - // 
N是子类中节点的数目,LS是N个节点的线性和,SS是N个节点的平方和 - int NumberOfNodes; - int index; + // N是子类中节点的数目,LS是N个节点的线性和,SS是N个节点的平方和 + int NumberOfNodes; + int index; public: - std::vector LS; - std::vector SS; - CF(); - ~CF(); - int getN() const; - void setN(int n); - std::vector getLS() const; - std::vector getSS() const; - double getLSItem(int index) const; - double getSSItem(int index) const; - void setLS(std::vector &newLs); - void setSS(std::vector &newSs); - SESAME::CFPtr copy(); - int getIndex(); - void setIndex(int id); + std::vector LS; + std::vector SS; + CF(); + ~CF(); + int getN() const; + void setN(int n); + std::vector getLS() const; + std::vector getSS() const; + double getLSItem(int index) const; + double getSSItem(int index) const; + void setLS(std::vector &newLs); + void setSS(std::vector &newSs); + SESAME::CFPtr copy(); + int getIndex(); + void setIndex(int id); }; template -concept NodeConcept = requires(T t) -{ - t->Centroid(); - t->cf.num; - t->index; - t->Update(GenericFactory::New(0)); - t->Scale(1.0); +concept NodeConcept = requires(T t) { + t->Centroid(); + t->cf.num; + t->index; + t->Update(GenericFactory::New(0)); + t->Scale(1.0); }; template -std::vector> CalcAdjMatrix(const std::vector &nodes) -{ - const int n = nodes.size(); - std::vector> adjMatrix(n, std::vector(n, 0.0)); - std::vector centroids(n); - for (int i = 0; i < n; ++i) - { - centroids[i] = nodes[i]->Centroid(); +std::vector> CalcAdjMatrix(const std::vector &nodes) { + const int n = nodes.size(); + std::vector> adjMatrix(n, std::vector(n, 0.0)); + std::vector centroids(n); + for (int i = 0; i < n; ++i) { + centroids[i] = nodes[i]->Centroid(); + } + for (int i = 0; i < n; i++) { + for (int j = i + 1; j < n; j++) { + auto distance = centroids[i]->L1Dist(centroids[j]); + adjMatrix[i][j] = distance, adjMatrix[j][i] = distance; } - for (int i = 0; i < n; i++) - { - for (int j = i + 1; j < n; j++) - { - auto distance = centroids[i]->L1Dist(centroids[j]); - adjMatrix[i][j] = distance, adjMatrix[j][i] = 
distance; - } - } - return adjMatrix; + } + return adjMatrix; } template -auto CalcClosestNode(const std::vector &nodes, PointPtr point) -{ - double minDist = std::numeric_limits::max(); - T node = nullptr; - for (auto child : nodes) - { - auto centroid = child->Centroid(); - auto distance = centroid->L2Dist(point); - if (distance < minDist) - { - minDist = distance; - node = child; - } +auto CalcClosestNode(const std::vector &nodes, PointPtr point) { + double minDist = std::numeric_limits::max(); + T node = nullptr; + for (auto child : nodes) { + auto centroid = child->Centroid(); + auto distance = centroid->L2Dist(point); + if (distance < minDist) { + minDist = distance; + node = child; } - return std::make_pair(node, minDist); + } + return std::make_pair(node, minDist); } -template -double CalcClusterL1Dist(T a, T b) -{ - auto ca = a->Centroid(), cb = b->Centroid(); - return ca->L1Dist(cb); +template double CalcClusterL1Dist(T a, T b) { + auto ca = a->Centroid(), cb = b->Centroid(); + return ca->L1Dist(cb); } -template -double CalcClusterL2Dist(T a, T b) -{ - auto ca = a->Centroid(), cb = b->Centroid(); - return ca->L2Dist(cb); +template double CalcClusterL2Dist(T a, T b) { + auto ca = a->Centroid(), cb = b->Centroid(); + return ca->L2Dist(cb); } -struct ClusteringFeatures -{ - // 原CF结构体,num是子类中节点的数目,LS是N个节点的线性和,SS是N个节点的平方和 - int num = 0; - std::vector ls, ss; - ClusteringFeatures(int d = 0) : ls(std::vector(d, 0.0)), ss(std::vector(d, 0.0)) - {} +struct ClusteringFeatures { + // 原CF结构体,num是子类中节点的数目,LS是N个节点的线性和,SS是N个节点的平方和 + int num = 0; + std::vector ls, ss; + ClusteringFeatures(int d = 0) + : ls(std::vector(d, 0.0)), ss(std::vector(d, 0.0)) {} }; -} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_FEATUREVECTOR_H_ +} // namespace SESAME +#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_FEATUREVECTOR_H_ diff --git a/include/Algorithm/DataStructure/GenericFactory.hpp b/include/Algorithm/DataStructure/GenericFactory.hpp index 
c025a35d..30293ce0 100644 --- a/include/Algorithm/DataStructure/GenericFactory.hpp +++ b/include/Algorithm/DataStructure/GenericFactory.hpp @@ -12,19 +12,15 @@ #include -namespace SESAME -{ +namespace SESAME { -namespace GenericFactory -{ +namespace GenericFactory { -template -std::shared_ptr New(Ts &&...ts) -{ - return std::make_shared(std::forward(ts)...); +template std::shared_ptr New(Ts &&...ts) { + return std::make_shared(std::forward(ts)...); } -} // namespace GenericFactory +} // namespace GenericFactory -} // namespace SESAME +} // namespace SESAME #endif \ No newline at end of file diff --git a/include/Algorithm/DataStructure/GridCluster.hpp b/include/Algorithm/DataStructure/GridCluster.hpp index b5fff902..4954eb36 100644 --- a/include/Algorithm/DataStructure/GridCluster.hpp +++ b/include/Algorithm/DataStructure/GridCluster.hpp @@ -8,84 +8,84 @@ #include #include #include -namespace SESAME -{ +namespace SESAME { class GridCluster; typedef std::unordered_map HashGrids; -class GridCluster -{ +class GridCluster { public: - HashGrids grids; - HashGrids visited; - int clusterLabel; - // Initialize - GridCluster(int label); - GridCluster(); - GridCluster(HashGrids hashMap, int label); - /** - * @param grid the density grid to add to the cluster - */ - void addGrid(const DensityGrid& grid); + HashGrids grids; + HashGrids visited; + int clusterLabel; + // Initialize + GridCluster(int label); + GridCluster(); + GridCluster(HashGrids hashMap, int label); + /** + * @param grid the density grid to add to the cluster + */ + void addGrid(const DensityGrid &grid); - /** - * @param dg the density grid to remove from the cluster - */ - void removeGrid(const DensityGrid& grid); + /** + * @param dg the density grid to remove from the cluster + */ + void removeGrid(const DensityGrid &grid); - /** - * @param gridClus the GridCluster to be absorbed into this cluster - */ - void absorbCluster(GridCluster gridCluster); - /** - * Inside Grids are defined in Definition 3.5 of Chen 
and Tu 2007 as: - * Consider a grid group G and a grid g ∈ G, suppose g =(j1, ··· ,jd), if g has - * neighboring grids in every dim i =1, ·· · ,d, then g is an inside grid - * in G.Otherwise g is an outside grid in G. - * - * @param grid the density grid to label as being inside or out - * @return TRUE if g is an inside grid, FALSE otherwise - */ - bool isInside(DensityGrid grid); + /** + * @param gridClus the GridCluster to be absorbed into this cluster + */ + void absorbCluster(GridCluster gridCluster); + /** + * Inside Grids are defined in Definition 3.5 of Chen and Tu 2007 as: + * Consider a grid group G and a grid g ∈ G, suppose g =(j1, ··· ,jd), if g + * has neighboring grids in every dim i =1, ·· · ,d, then g is an inside grid + * in G.Otherwise g is an outside grid in G. + * + * @param grid the density grid to label as being inside or out + * @return TRUE if g is an inside grid, FALSE otherwise + */ + bool isInside(DensityGrid grid); - /** - * Inside Grids are defined in Definition 3.5 of Chen and Tu 2007 as: - * Consider a grid group G and a grid g ∈ G, suppose g =(j1, ··· ,jd), if g has - * neighboring grids in every dim i =1, ·· · ,d, then g is an inside grid - * in G. Otherwise g is an outside grid in G. - * - * @param grid the density grid being labelled as inside or outside - * @param other the density grid being proposed for addition - * @return TRUE if g would be an inside grid, FALSE otherwise - */ - bool isInside(DensityGrid grid, DensityGrid other); + /** + * Inside Grids are defined in Definition 3.5 of Chen and Tu 2007 as: + * Consider a grid group G and a grid g ∈ G, suppose g =(j1, ··· ,jd), if g + * has neighboring grids in every dim i =1, ·· · ,d, then g is an inside grid + * in G. Otherwise g is an outside grid in G. 
+ * + * @param grid the density grid being labelled as inside or outside + * @param other the density grid being proposed for addition + * @return TRUE if g would be an inside grid, FALSE otherwise + */ + bool isInside(DensityGrid grid, DensityGrid other); - /** - * add a grid into grids, if exists, update value, if not, insert - */ - void putHashGrid(HashGrids grids1, const DensityGrid& g, bool inside); + /** + * add a grid into grids, if exists, update value, if not, insert + */ + void putHashGrid(HashGrids grids1, const DensityGrid &g, bool inside); - /** - * Tests a grid cluster for connectedness according to Definition 3.4, Grid Group, from - * Chen and Tu 2007. - * - * Selects one density grid in the grid cluster as a starting point and iterates repeatedly - * through its neighbours until no more density grids in the grid cluster can be visited. - * - * @return TRUE if the cluster represent one single grid group; FALSE otherwise. - */ + /** + * Tests a grid cluster for connectedness according to Definition 3.4, Grid + * Group, from Chen and Tu 2007. + * + * Selects one density grid in the grid cluster as a starting point and + * iterates repeatedly through its neighbours until no more density grids in + * the grid cluster can be visited. + * + * @return TRUE if the cluster represent one single grid group; FALSE + * otherwise. + */ - bool isConnected(); + bool isConnected(); - /** - * Iterates through the DensityGrids in the cluster and calculates the inclusion probability for - * each. - * - * @return 1.0 if instance matches any of the density grids; 0.0 otherwise. - */ - double getInclusionProb(Point point); - bool operator==(GridCluster& Other) const; + /** + * Iterates through the DensityGrids in the cluster and calculates the + * inclusion probability for each. + * + * @return 1.0 if instance matches any of the density grids; 0.0 otherwise. 
+ */ + double getInclusionProb(Point point); + bool operator==(GridCluster &Other) const; }; -} // namespace SESAME +} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_GRIDCLUSTER_HPP_ +#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_GRIDCLUSTER_HPP_ diff --git a/include/Algorithm/DataStructure/MeyersonSketch.hpp b/include/Algorithm/DataStructure/MeyersonSketch.hpp index 787b0d18..9f807f59 100644 --- a/include/Algorithm/DataStructure/MeyersonSketch.hpp +++ b/include/Algorithm/DataStructure/MeyersonSketch.hpp @@ -10,81 +10,79 @@ #include #include -namespace SESAME -{ -class MeyersonSketch : public std::enable_shared_from_this -{ +namespace SESAME { +class MeyersonSketch : public std::enable_shared_from_this { public: - struct Node; - using NodePtr = std::shared_ptr; - using SktchPtr = std::shared_ptr; + struct Node; + using NodePtr = std::shared_ptr; + using SktchPtr = std::shared_ptr; private: - const SesameParam ¶m; - std::vector samples; - bool has_sampled = false; - Random r; - NodePtr Process(PointPtr); - NodePtr CreateCenter(PointPtr); - std::vector centers; - int max_sketch_size_; - double distance_denominator_; + const SesameParam ¶m; + std::vector samples; + bool has_sampled = false; + Random r; + NodePtr Process(PointPtr); + NodePtr CreateCenter(PointPtr); + std::vector centers; + int max_sketch_size_; + double distance_denominator_; public: - MeyersonSketch(const SesameParam ¶m); - void Init(); - NodePtr Insert(PointPtr input); - NodePtr Insert(NodePtr node); - void Remove(NodePtr node); - std::vector &clusters(); - void ForEach(std::function func); + MeyersonSketch(const SesameParam ¶m); + void Init(); + NodePtr Insert(PointPtr input); + NodePtr Insert(NodePtr node); + void Remove(NodePtr node); + std::vector &clusters(); + void ForEach(std::function func); public: - struct Node : std::enable_shared_from_this - { - size_t timestamp = 0; - int index = 0; - const int dim; - ClusteringFeatures cf; - double costs_sum_dist = 0.0, 
costs_sum_sq_dist = 0.0; - Node(SktchPtr s, PointPtr p) : dim(p->getDimension()), cf(p->getDimension()) { Update(p); } - Node(PointPtr p) : dim(p->getDimension()), cf(p->getDimension()) { Update(p); } - PointPtr Centroid() - { - auto c = GenericFactory::New(dim); - for (int i = 0; i < dim; ++i) c->setFeatureItem(cf.ls[i] / cf.num, i); - return c; - } - void Update(PointPtr point) - { - cf.num += point->sgn; - double d = point->L2Dist(Centroid()); - costs_sum_dist += d * point->sgn; - costs_sum_sq_dist += d * d * point->sgn; - for (int i = 0; i < dim; ++i) - { - auto val = point->getFeatureItem(i); - cf.ls[i] += val * point->sgn; - cf.ss[i] += (val * val) * point->sgn; - } - } - void Scale(double scale) - { - costs_sum_dist *= scale; - costs_sum_sq_dist *= scale * scale; - for (int i = 0; i < dim; ++i) - { - cf.ls[i] *= scale; - cf.ss[i] *= scale * scale; - } - } - }; + struct Node : std::enable_shared_from_this { + size_t timestamp = 0; + int index = 0; + const int dim; + ClusteringFeatures cf; + double costs_sum_dist = 0.0, costs_sum_sq_dist = 0.0; + Node(SktchPtr s, PointPtr p) + : dim(p->getDimension()), cf(p->getDimension()) { + Update(p); + } + Node(PointPtr p) : dim(p->getDimension()), cf(p->getDimension()) { + Update(p); + } + PointPtr Centroid() { + auto c = GenericFactory::New(dim); + for (int i = 0; i < dim; ++i) + c->setFeatureItem(cf.ls[i] / cf.num, i); + return c; + } + void Update(PointPtr point) { + cf.num += point->sgn; + double d = point->L2Dist(Centroid()); + costs_sum_dist += d * point->sgn; + costs_sum_sq_dist += d * d * point->sgn; + for (int i = 0; i < dim; ++i) { + auto val = point->getFeatureItem(i); + cf.ls[i] += val * point->sgn; + cf.ss[i] += (val * val) * point->sgn; + } + } + void Scale(double scale) { + costs_sum_dist *= scale; + costs_sum_sq_dist *= scale * scale; + for (int i = 0; i < dim; ++i) { + cf.ls[i] *= scale; + cf.ss[i] *= scale * scale; + } + } + }; }; -std::pair guess_optimum_range_bounds(Random *r, - const std::vector 
&samples, - int window_size, int num_samples, int k); +std::pair +guess_optimum_range_bounds(Random *r, const std::vector &samples, + int window_size, int num_samples, int k); -} // namespace SESAME +} // namespace SESAME #endif \ No newline at end of file diff --git a/include/Algorithm/DataStructure/MicroCluster.hpp b/include/Algorithm/DataStructure/MicroCluster.hpp index 85ebc2f6..e05a2068 100644 --- a/include/Algorithm/DataStructure/MicroCluster.hpp +++ b/include/Algorithm/DataStructure/MicroCluster.hpp @@ -1,4 +1,5 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) // // Created by 1124a on 2021/8/16. @@ -12,76 +13,75 @@ #include #include #include -namespace SESAME -{ +namespace SESAME { typedef std::vector dataPoint; class MicroCluster; typedef std::shared_ptr MicroClusterPtr; -class MicroCluster -{ +class MicroCluster { public: - constexpr static const double EPSILON = 0.00005; - constexpr static const double MIN_VARIANCE = 1e-50; // TODO Need to move ... + constexpr static const double EPSILON = 0.00005; + constexpr static const double MIN_VARIANCE = 1e-50; // TODO Need to move ... - std::vector id; - dataPoint centroid; - dataPoint LS; // the sum of the data values for each dim - dataPoint SS; // the sum of the squares of the data values for each dim - int LST; // the sum of the time stamps Ti~... Tin - int SST; // the sum of the squares of the time stamps Til... Tin - double weight; // number of data point in the clusters - int dim; - double radius; // Used in DBStream - // the parameters below is unique for DenStream - int createTime; - int lastUpdateTime; - bool visited; + std::vector id; + dataPoint centroid; + dataPoint LS; // the sum of the data values for each dim + dataPoint SS; // the sum of the squares of the data values for each dim + int LST; // the sum of the time stamps Ti~... 
Tin + int SST; // the sum of the squares of the time stamps Til... Tin + double weight; // number of data point in the clusters + int dim; + double radius; // Used in DBStream + // the parameters below is unique for DenStream + int createTime; + int lastUpdateTime; + bool visited; - // TODO 1. Need to subtract Base class of CF vector when all cf-vector based-algorithms have - // been implemented - // 2.this may need to modify in the future (All algorithms used this, - // e.g.DenStream,CluStream,DenStream,DBStream,SWEM =.=) + // TODO 1. Need to subtract Base class of CF vector when all cf-vector + // based-algorithms have been implemented + // 2.this may need to modify in the future (All algorithms used this, + // e.g.DenStream,CluStream,DenStream,DBStream,SWEM =.=) - MicroCluster(int dim, int id); - MicroCluster(int dim, int id, PointPtr dataPoint, double radius); // DBStream + MicroCluster(int dim, int id); + MicroCluster(int dim, int id, PointPtr dataPoint, double radius); // DBStream - ~MicroCluster(); - void Init(PointPtr datapoint, int timestamp); - void insert(PointPtr datapoint, int timestamp); // Used in CluStream - bool insert(PointPtr datapoint, double decayFactor, double epsilon); // DenStream - void insert(PointPtr datapoint); // DBStream //, double decayFactor - void merge(MicroClusterPtr other); - void subtractClusterVector(MicroClusterPtr other); - void updateId(MicroClusterPtr other); + ~MicroCluster(); + void Init(PointPtr datapoint, int timestamp); + void insert(PointPtr datapoint, int timestamp); // Used in CluStream + bool insert(PointPtr datapoint, double decayFactor, + double epsilon); // DenStream + void insert(PointPtr datapoint); // DBStream //, double decayFactor + void merge(MicroClusterPtr other); + void subtractClusterVector(MicroClusterPtr other); + void updateId(MicroClusterPtr other); - void resetID(int index); // Used in DenStream - double getRadius(double decayFactor, bool judge); // Used in DenStream + void resetID(int index); // 
Used in DenStream + double getRadius(double decayFactor, bool judge); // Used in DenStream - double getRelevanceStamp(int num_last_arr) const; - double getMutime() const; - double getSigmaTime() const; - static double getQuantile(double z); - double getRadius(double radius); - double getDeviation(); - dataPoint getCentroid(); - PointPtr getCenter(); - double getInclusionProbability(PointPtr datapoint, double radius); - dataPoint getVarianceVector(); - double calCentroidDistance(PointPtr datapoint); - bool judgeMerge(MicroClusterPtr other); - double getDistance(PointPtr datapoint); // DBStream - double getDistance(MicroClusterPtr other); // DBStream - void move(); // DBStream - void decayWeight(double decayFactor); - SESAME::MicroClusterPtr copy(); + double getRelevanceStamp(int num_last_arr) const; + double getMutime() const; + double getSigmaTime() const; + static double getQuantile(double z); + double getRadius(double radius); + double getDeviation(); + dataPoint getCentroid(); + PointPtr getCenter(); + double getInclusionProbability(PointPtr datapoint, double radius); + dataPoint getVarianceVector(); + double calCentroidDistance(PointPtr datapoint); + bool judgeMerge(MicroClusterPtr other); + double getDistance(PointPtr datapoint); // DBStream + double getDistance(MicroClusterPtr other); // DBStream + void move(); // DBStream + void decayWeight(double decayFactor); + SESAME::MicroClusterPtr copy(); private: - double distance; - static double inverseError(double x); + double distance; + static double inverseError(double x); }; -} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_MICROCLUSTER_HPP_ +} // namespace SESAME +#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_MICROCLUSTER_HPP_ diff --git a/include/Algorithm/DataStructure/OutlierReservoir.hpp b/include/Algorithm/DataStructure/OutlierReservoir.hpp index 6d8a3405..55c827b5 100644 --- a/include/Algorithm/DataStructure/OutlierReservoir.hpp +++ 
b/include/Algorithm/DataStructure/OutlierReservoir.hpp @@ -10,48 +10,46 @@ #include #include -namespace SESAME -{ +namespace SESAME { class OutlierReservoir; typedef std::shared_ptr OutPtr; -class OutlierReservoir -{ +class OutlierReservoir { private: - double r; - double timeGap; - long lastDelTime; + double r; + double timeGap; + long lastDelTime; - double a; - double lamd; + double a; + double lamd; - std::unordered_set outliers; + std::unordered_set outliers; public: - [[nodiscard]] double GetR(); - void SetR(double r); - [[nodiscard]] double GetTimeGap(); - void SetTimeGap(double time_gap); - [[nodiscard]] long GetLastDelTime(); - void SetLastDelTime(long last_del_time); - [[nodiscard]] double GetA(); - void SetA(double a); - [[nodiscard]] double GetLamd(); - void SetLamd(double lamd); - [[nodiscard]] std::unordered_set &getOutliers(); - void setOutliers(std::unordered_set &outliers); - OutlierReservoir(); - ~OutlierReservoir(); - OutlierReservoir(double r, double a, double lamd); - - void setTimeGap(double timeGap); - - void insert(SESAME::DPNodePtr &c); - - SESAME::DPNodePtr insert(SESAME::PointPtr &p, double time); - - void remove(SESAME::DPNodePtr &nn); + [[nodiscard]] double GetR(); + void SetR(double r); + [[nodiscard]] double GetTimeGap(); + void SetTimeGap(double time_gap); + [[nodiscard]] long GetLastDelTime(); + void SetLastDelTime(long last_del_time); + [[nodiscard]] double GetA(); + void SetA(double a); + [[nodiscard]] double GetLamd(); + void SetLamd(double lamd); + [[nodiscard]] std::unordered_set &getOutliers(); + void setOutliers(std::unordered_set &outliers); + OutlierReservoir(); + ~OutlierReservoir(); + OutlierReservoir(double r, double a, double lamd); + + void setTimeGap(double timeGap); + + void insert(SESAME::DPNodePtr &c); + + SESAME::DPNodePtr insert(SESAME::PointPtr &p, double time); + + void remove(SESAME::DPNodePtr &nn); }; -} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_OUTLIERRESERVOIR_HPP_ +} // namespace 
SESAME +#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_OUTLIERRESERVOIR_HPP_ diff --git a/include/Algorithm/DataStructure/Point.hpp b/include/Algorithm/DataStructure/Point.hpp index 54ea9448..53330fff 100644 --- a/include/Algorithm/DataStructure/Point.hpp +++ b/include/Algorithm/DataStructure/Point.hpp @@ -8,58 +8,52 @@ #ifndef SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_POINT_HPP_ #define SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_POINT_HPP_ -#include +#include "Utils/Types.hpp" + #include #include #include -namespace SESAME -{ -class Point; +namespace SESAME { +struct Point; typedef std::shared_ptr PointPtr; -class Point -{ -public: - using clock_t = std::chrono::_V2::system_clock::time_point; - int index; // 1,2,3,4,5.... - double weight = 1; // considering the outdated effect - double cost; - double min_dist; - double knn = 0.0, conn = 1.0; - int timestamp; - bool outlier = false; - int sgn = 1; // the distance to the nearest data point - int clu_id = -1; // using index to identify - int dim; // feature Length - clock_t toa; // time of arrival - std::vector feature; // TODO: need to think how to remove * here. 
- Point(int dim, int index = -1, double weight = 1.0, double cost = 0.0, int timestamp = 0); - PointPtr copy(); - void setCost(double c); - double getCost() const; - int getIndex() const; - void setIndex(int index); - double getWeight() const; - void setWeight(double weight); - double getFeatureItem(int index) const; - void setFeatureItem(double feature, int index); - int getClusteringCenter() const; - void setClusteringCenter(int index); - int getDimension() const; - void setDimension(int d); - int getFeatureLength(); - double getMinDist() const; - void setMinDist(double min_dist); - void setTimeStamp(int t); - int getTimeStamp() const; - bool getOutlier(); - void setOutlier(bool flag); - double L2Dist(PointPtr centroid); - double L1Dist(PointPtr centroid); - PointPtr Reverse(); - std::string Serialize(); - void Debug(); +struct Point { + uint64 index; // 1,2,3,4,5.... + fp64 weight = 1.0; // considering the outdated effect + fp64 cost = 0.0; + fp64 min_dist; + fp64 knn = 0.0, conn = 1.0; + bool outlier = false; + int8 sgn = 1; // the distance to the nearest data point + int32 clu_id = -1; // using index to identify + uint32 dim = 0; // feature Length + clock_t toa; // time of arrival + uint64 timestamp; // the time stamp of the data point + std::vector feature; + Point(uint32 dim = 0, uint64 index = 0, feature_t *feature = nullptr); + fp64 *data() { return feature.data(); } + PointPtr copy(); + int getIndex() const; + void setIndex(int index); + double getWeight() const; + void setWeight(double weight); + double getFeatureItem(int index) const; + void setFeatureItem(double feature, int index); + int getClusteringCenter() const; + void setClusteringCenter(int index); + int getDimension() const; + int getFeatureLength(); + double getMinDist() const; + void setMinDist(double min_dist); + bool getOutlier(); + void setOutlier(bool flag); + double L2Dist(PointPtr centroid); + double L1Dist(PointPtr centroid); + PointPtr Reverse(); + std::string Serialize(); + void 
Debug(); }; -} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_POINT_HPP_ +} // namespace SESAME + +#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_POINT_HPP_ diff --git a/include/Algorithm/DataStructure/Snapshot.hpp b/include/Algorithm/DataStructure/Snapshot.hpp index 0a684ff7..f520bf36 100644 --- a/include/Algorithm/DataStructure/Snapshot.hpp +++ b/include/Algorithm/DataStructure/Snapshot.hpp @@ -1,4 +1,5 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) // // Created by 1124a on 2021/8/16. @@ -15,8 +16,7 @@ #include #include -namespace SESAME -{ +namespace SESAME { class Snapshot; typedef std::shared_ptr SnapshotPtr; @@ -24,26 +24,26 @@ typedef std::vector MicroClusters; typedef std::vector QueueSnapshotPtr; typedef std::vector QueueOrderSnapshot; -class Snapshot -{ +class Snapshot { public: - int elapsedTime; - MicroClusters microClusters; - - /** - QueueSnapshotPtr: Data Structure representing order ith snapshots list - QueueOrderSnapshotPtr: Data Structure representing orders - **/ - Snapshot(MicroClusters& otherMicroClusters, int elapsedTime); - ~Snapshot(); - static SnapshotPtr findSnapshot(QueueOrderSnapshot orderSnapShots, int landmarkTime, - int currentElapsedTime, unsigned int currentOrder); - - static SnapshotPtr substractSnapshot(SnapshotPtr snapshotCurrent, - const SnapshotPtr& snapshotLandmark, - unsigned int num_clusters); - - SnapshotPtr copy(); + int elapsedTime; + MicroClusters microClusters; + + /** + QueueSnapshotPtr: Data Structure representing order ith snapshots list + QueueOrderSnapshotPtr: Data Structure representing orders + **/ + Snapshot(MicroClusters &otherMicroClusters, int elapsedTime); + ~Snapshot(); + static SnapshotPtr findSnapshot(QueueOrderSnapshot orderSnapShots, + int landmarkTime, int currentElapsedTime, + unsigned int currentOrder); + + static SnapshotPtr 
substractSnapshot(SnapshotPtr snapshotCurrent, + const SnapshotPtr &snapshotLandmark, + unsigned int num_clusters); + + SnapshotPtr copy(); }; -} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_SNAPSHOT_HPP_ +} // namespace SESAME +#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_SNAPSHOT_HPP_ diff --git a/include/Algorithm/DataStructure/TreeNode.hpp b/include/Algorithm/DataStructure/TreeNode.hpp index 713819f8..c1a9c3cc 100644 --- a/include/Algorithm/DataStructure/TreeNode.hpp +++ b/include/Algorithm/DataStructure/TreeNode.hpp @@ -1,4 +1,5 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) // // Created by Shuhao Zhang on 26/07/2021. @@ -11,36 +12,34 @@ #include #include -namespace SESAME -{ +namespace SESAME { class TreeNode; typedef std::shared_ptr TreeNodePtr; -class TreeNode -{ +class TreeNode { public: - // number of points in this node - int n; + // number of points in this node + int n; - // array with pointers on points - std::vector points; + // array with pointers on points + std::vector points; - // pointer on the centre of the treenode - PointPtr centre; + // pointer on the centre of the treenode + PointPtr centre; - // pointer on the left childnode - TreeNodePtr lc; + // pointer on the left childnode + TreeNodePtr lc; - // pointer on the right childnode - TreeNodePtr rc; + // pointer on the right childnode + TreeNodePtr rc; - // pointer on the parent node - TreeNodePtr parent; + // pointer on the parent node + TreeNodePtr parent; - // cost of the treenode - double cost; + // cost of the treenode + double cost; }; -} // namespace SESAME +} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_TREENODE_HPP_ +#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_TREENODE_HPP_ diff --git a/include/Algorithm/DataStructure/WeightedAdjacencyList.hpp 
b/include/Algorithm/DataStructure/WeightedAdjacencyList.hpp index d47b8804..cf75eb7b 100644 --- a/include/Algorithm/DataStructure/WeightedAdjacencyList.hpp +++ b/include/Algorithm/DataStructure/WeightedAdjacencyList.hpp @@ -13,62 +13,59 @@ #include #include -namespace SESAME -{ +namespace SESAME { struct MicroClusterPair; typedef std::shared_ptr MicroClusterPairPtr; -struct MicroClusterPair -{ - MicroClusterPtr microCluster1; - MicroClusterPtr microCluster2; - MicroClusterPair(MicroClusterPtr microCluster1, MicroClusterPtr microCluster2) - { - this->microCluster1 = microCluster1->copy(); - this->microCluster2 = microCluster2->copy(); - } - // bool operator==(const MicroClusterPair &other) const; +struct MicroClusterPair { + MicroClusterPtr microCluster1; + MicroClusterPtr microCluster2; + MicroClusterPair(MicroClusterPtr microCluster1, + MicroClusterPtr microCluster2) { + this->microCluster1 = microCluster1->copy(); + this->microCluster2 = microCluster2->copy(); + } + // bool operator==(const MicroClusterPair &other) const; }; -struct KeyHasher -{ - std::size_t operator()(const MicroClusterPair µClusterPair) const - { - return (std::hash()(microClusterPair.microCluster1->id.front())) ^ - (std::hash()(microClusterPair.microCluster2->id.front())); - } +struct KeyHasher { + std::size_t operator()(const MicroClusterPair µClusterPair) const { + return (std::hash()(microClusterPair.microCluster1->id.front())) ^ + (std::hash()(microClusterPair.microCluster2->id.front())); + } }; -struct EqualKey -{ - bool operator()(const MicroClusterPair &MCPair1, const MicroClusterPair &MCPair2) const - { - if (MCPair1.microCluster1->id.front() == MCPair2.microCluster1->id.front() && - MCPair1.microCluster2->id.front() == MCPair2.microCluster2->id.front()) - return true; - if (MCPair1.microCluster1->id.front() == MCPair2.microCluster2->id.front() && - MCPair1.microCluster2->id.front() == MCPair2.microCluster1->id.front()) - return true; - return false; - } +struct EqualKey { + bool 
operator()(const MicroClusterPair &MCPair1, + const MicroClusterPair &MCPair2) const { + if (MCPair1.microCluster1->id.front() == + MCPair2.microCluster1->id.front() && + MCPair1.microCluster2->id.front() == MCPair2.microCluster2->id.front()) + return true; + if (MCPair1.microCluster1->id.front() == + MCPair2.microCluster2->id.front() && + MCPair1.microCluster2->id.front() == MCPair2.microCluster1->id.front()) + return true; + return false; + } }; class AdjustedWeight; typedef std::shared_ptr AdjustedWeightPtr; -class AdjustedWeight -{ +class AdjustedWeight { public: - double weight; - int updateTime = 0; // - timespec updateTime0; - AdjustedWeight(double weight, int pointTime, - timespec pointTime0); // clock_t pointTime - void add(int startTime, double decayValue); - void add(timespec startTime, double decayValue); - double getCurrentWeight(double decayFactor); + double weight; + int updateTime = 0; // + timespec updateTime0; + AdjustedWeight(double weight, int pointTime, + timespec pointTime0); // clock_t pointTime + void add(int startTime, double decayValue); + void add(timespec startTime, double decayValue); + double getCurrentWeight(double decayFactor); }; -typedef std::unordered_map +typedef std::unordered_map WeightedAdjacencyList; typedef std::pair DensityGraph; -} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_WEIGHTEDADJACENCYLIST_HPP_ +} // namespace SESAME +#endif // SESAME_INCLUDE_ALGORITHM_DATASTRUCTURE_WEIGHTEDADJACENCYLIST_HPP_ diff --git a/include/Algorithm/DenStream.hpp b/include/Algorithm/DenStream.hpp index 4f5a119b..20820dd0 100644 --- a/include/Algorithm/DenStream.hpp +++ b/include/Algorithm/DenStream.hpp @@ -16,60 +16,63 @@ #include #include -namespace SESAME -{ +namespace SESAME { #define noVisited (-1) -class DenStreamParams : public SesameParam -{ +class DenStreamParams : public SesameParam { public: - int buf_sizeSize; // number of data point for Initialization - unsigned int min_points; // minimum point of core 
point in DBSCAN - double epsilon; // maximum distance if point belongs to the density area of - // core point - double base; // base of decay function - double lambda; - double mu; // used to calculate minimum weight minWeight=mu*beta; - double beta; // used to calculate minimum weight + int buf_sizeSize; // number of data point for Initialization + unsigned int min_points; // minimum point of core point in DBSCAN + double epsilon; // maximum distance if point belongs to the density area of + // core point + double base; // base of decay function + double lambda; + double mu; // used to calculate minimum weight minWeight=mu*beta; + double beta; // used to calculate minimum weight }; -class DenStream : public Algorithm -{ +class DenStream : public Algorithm { public: - DenStreamParams denStreamParams; - std::shared_ptr dbscan; // used for initialization and offline re-clustering - DampedWindowPtr dampedWindow; - std::vector pMicroClusters; - std::vector oMicroClusters; - int startTime; - int pointArrivingTime; // clock_t - int lastPointTime; - int lastUpdateTime; // for calculating time interval - double Tp; - int pMicroClusterIndex; - int oMicroClusterIndex; + DenStreamParams denStreamParams; + std::shared_ptr + dbscan; // used for initialization and offline re-clustering + DampedWindowPtr dampedWindow; + std::vector pMicroClusters; + std::vector oMicroClusters; + int startTime; + int pointArrivingTime; // clock_t + int lastPointTime; + int lastUpdateTime; // for calculating time interval + double Tp; + int pMicroClusterIndex; + int oMicroClusterIndex; - DenStream(param_t &cmd_params); - ~DenStream(); - void Init() override; - void RunOnline(PointPtr input) override; - void RunOffline(DataSinkPtr sinkPtr) override; - double getMinWeight() { return minWeight; }; + DenStream(param_t &cmd_params); + ~DenStream(); + void Init() override; + void RunOnline(PointPtr input) override; + void RunOffline(DataSinkPtr sinkPtr) override; + double getMinWeight() { return 
minWeight; }; private: - bool isInitial = false; - vector initialBuffer; - double minWeight; - void Init(vector &initData); - void merge(PointPtr dataPoint); - void pointsNearCorePoint(vector &initData, std::vector pointIndex, - MicroClusterPtr microCluster); - MicroClusterPtr nearestNeighbor(PointPtr dataPoint, std::vector microClusters); - bool mergeToMicroCluster(PointPtr dataPoint, std::vector microClusters); - bool mergeToOMicroCluster(PointPtr dataPoint, std::vector microClusters); - static void microClusterToPoint(std::vector µClusters, - vector &points); - int findIndex(std::vector µClusters, MicroClusterPtr MC); - // TODO overlap functions with Clustream, may need to remove to utils folder + bool isInitial = false; + vector initialBuffer; + double minWeight; + void Init(vector &initData); + void merge(PointPtr dataPoint); + void pointsNearCorePoint(vector &initData, + std::vector pointIndex, + MicroClusterPtr microCluster); + MicroClusterPtr nearestNeighbor(PointPtr dataPoint, + std::vector microClusters); + bool mergeToMicroCluster(PointPtr dataPoint, + std::vector microClusters); + bool mergeToOMicroCluster(PointPtr dataPoint, + std::vector microClusters); + static void microClusterToPoint(std::vector µClusters, + vector &points); + int findIndex(std::vector µClusters, + MicroClusterPtr MC); + // TODO overlap functions with Clustream, may need to remove to utils folder }; -} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_DENSTREAM_HPP_ +} // namespace SESAME +#endif // SESAME_INCLUDE_ALGORITHM_DENSTREAM_HPP_ diff --git a/include/Algorithm/DesignAspect/Generic.hpp b/include/Algorithm/DesignAspect/Generic.hpp index 8f7ea060..d67f5437 100644 --- a/include/Algorithm/DesignAspect/Generic.hpp +++ b/include/Algorithm/DesignAspect/Generic.hpp @@ -20,349 +20,294 @@ #include "Algorithm/DataStructure/CoresetTree.hpp" #include "Algorithm/DataStructure/GenericFactory.hpp" #include "Algorithm/DataStructure/Point.hpp" -#include "Algorithm/Param.hpp" #include 
"Algorithm/OutlierDetection/OutlierDetection.hpp" +#include "Algorithm/Param.hpp" #include "Sinks/DataSink.hpp" #include "Utils/BenchmarkUtils.hpp" -namespace SESAME -{ +namespace SESAME { template -concept StreamClusteringConcept = requires -{ - requires requires(W w, PointPtr p) - { - { - w.Add(p) - } -> std::same_as; - }; - requires requires(D d, PointPtr p) - { - { - d.Insert(p) - } -> std::same_as; - }; - requires requires(O o, PointPtr p, typename D::NodePtr n, std::vector & vn) - { - { - o.Check(p, vn) - } -> std::same_as; - { - o.Check(n) - } -> std::same_as; - }; +concept StreamClusteringConcept = requires { + requires requires(W w, PointPtr p) { + { w.Add(p) } -> std::same_as; + }; + requires requires(D d, PointPtr p) { + { d.Insert(p) } -> std::same_as; + }; + requires requires(O o, PointPtr p, typename D::NodePtr n, + std::vector &vn) { + { o.Check(p, vn) } -> std::same_as; + { o.Check(n) } -> std::same_as; + }; }; template -requires StreamClusteringConcept -class StreamClustering : public Algorithm -{ + requires StreamClusteringConcept +class StreamClustering : public Algorithm { public: - StreamClustering(const param_t ¶m); - ~StreamClustering(); - void Init(); - void Insert(PointPtr); - void RunOnline(PointPtr input); - void RunOffline(DataSinkPtr ptr); - void store(std::string output_file, int dim, std::vector results); - void OutputOnline(std::vector ¢ers) override; + StreamClustering(const param_t ¶m); + ~StreamClustering(); + void Init(); + void Insert(PointPtr); + void RunOnline(PointPtr input); + void RunOffline(DataSinkPtr ptr); + void store(std::string output_file, int dim, std::vector results); + void OutputOnline(std::vector ¢ers) override; private: - using Node = typename D::Node; - using NodePtr = typename D::NodePtr; - std::shared_ptr w; - std::shared_ptr d; - std::shared_ptr o; - std::shared_ptr r; + using Node = typename D::Node; + using NodePtr = typename D::NodePtr; + std::shared_ptr w; + std::shared_ptr d; + std::shared_ptr o; + 
std::shared_ptr r; - std::vector outliers_; - std::unordered_map point_map_; - std::unordered_map> node_map_; - std::vector online_centers; - size_t cluster_size_ = 0, outlier_size_ = 0; - NodePtr InsertOutliers(PointPtr point); + std::vector outliers_; + std::unordered_map point_map_; + std::unordered_map> node_map_; + std::vector online_centers; + size_t cluster_size_ = 0, outlier_size_ = 0; + NodePtr InsertOutliers(PointPtr point); }; template -StreamClustering::~StreamClustering() -{} + requires StreamClusteringConcept +StreamClustering::~StreamClustering() {} template -StreamClustering::StreamClustering(const param_t &cmd_params) -{ - param = cmd_params; + requires StreamClusteringConcept +StreamClustering::StreamClustering(const param_t &cmd_params) { + param = cmd_params; } template -void StreamClustering::Init() -{ - w = GenericFactory::New(param); - d = GenericFactory::New(param); - o = GenericFactory::New(param); - r = GenericFactory::New(param); - d->Init(); - sum_timer.Tick(); + requires StreamClusteringConcept +void StreamClustering::Init() { + w = GenericFactory::New(param); + d = GenericFactory::New(param); + o = GenericFactory::New(param); + r = GenericFactory::New(param); + d->Init(); + sum_timer.Tick(); } template -void StreamClustering::Insert(PointPtr p) -{ - d->Insert(p); + requires StreamClusteringConcept +void StreamClustering::Insert(PointPtr p) { + d->Insert(p); } template -void StreamClustering::RunOnline(PointPtr input) -{ - constexpr bool has_delete = requires(W & w) { w.Delete(); }; - constexpr bool has_update = requires(W & w, NodePtr node) { w.Update(node); }; - constexpr bool buffer_enabled = O::buffer_enabled; - constexpr bool timer_enabled = O::timer_enabled; - constexpr bool no_outlier_detection = std::is_same::value; - constexpr bool is_coreset_tree = std::is_same::value; - if (w->Add(input)) - { - NodePtr node; + requires StreamClusteringConcept +void StreamClustering::RunOnline(PointPtr input) { + constexpr bool has_delete = 
requires(W &w) { w.Delete(); }; + constexpr bool has_update = requires(W &w, NodePtr node) { w.Update(node); }; + constexpr bool buffer_enabled = O::buffer_enabled; + constexpr bool timer_enabled = O::timer_enabled; + constexpr bool no_outlier_detection = std::is_same::value; + constexpr bool is_coreset_tree = std::is_same::value; + if (w->Add(input)) { + NodePtr node; + out_timer.Tick(); + bool out = false; + if constexpr (!no_outlier_detection) { + out = o->Check(input, d->clusters()); + } + out_timer.Tock(); + if (out) { // outlier + out_timer.Tick(); + input->outlier = true; + if constexpr (buffer_enabled) { + node = InsertOutliers(input); out_timer.Tick(); - bool out = false; - if constexpr (!no_outlier_detection) - { - out = o->Check(input, d->clusters()); - } + out = o->Check(node); out_timer.Tock(); - if (out) - { // outlier - out_timer.Tick(); - input->outlier = true; - if constexpr (buffer_enabled) - { - node = InsertOutliers(input); - out_timer.Tick(); - out = o->Check(node); - out_timer.Tock(); - if (!out) - { - const auto [first, last] = std::ranges::remove_if( - outliers_, [&](const NodePtr &n) { return n == node; }); - outliers_.erase(first, last); - ds_timer.Tick(); - auto newNode = d->Insert(node); - ds_timer.Tock(); - if constexpr (has_delete) - { - for (auto &p : node_map_[node]) - { - p->outlier = false; - node_map_[newNode].insert(p); - point_map_[p] = newNode; - } - node_map_.erase(node); - } - node = newNode; - } - } - out_timer.Tock(); - } - else - { - ds_timer.Tick(); - node = d->Insert(input); - if (node) node->timestamp = input->index; - ds_timer.Tock(); - } - if constexpr (has_delete) - { - node_map_[node].insert(input); - point_map_[input] = node; - } - if constexpr (!is_coreset_tree) - { - if (input->index % param.time_window == 0) - { - out_timer.Tick(); - auto &cls = d->clusters(); - std::vector del; - for (auto &node : cls) - { - if (o->TimerCheck(input, node)) - { - if constexpr (buffer_enabled) - { - outliers_.push_back(node); - } 
- else if constexpr (has_delete) - { - for (auto &p : node_map_[node]) - { - point_map_.erase(p); - } - node_map_.erase(node); - } - del.push_back(node); - } - } - for (auto &node : del) - { - d->Remove(node); - } - out_timer.Tock(); + if (!out) { + const auto [first, last] = std::ranges::remove_if( + outliers_, [&](const NodePtr &n) { return n == node; }); + outliers_.erase(first, last); + ds_timer.Tick(); + auto newNode = d->Insert(node); + ds_timer.Tock(); + if constexpr (has_delete) { + for (auto &p : node_map_[node]) { + p->outlier = false; + node_map_[newNode].insert(p); + point_map_[p] = newNode; } + node_map_.erase(node); + } + node = newNode; } + } + out_timer.Tock(); + } else { + ds_timer.Tick(); + node = d->Insert(input); + if (node) + node->timestamp = input->index; + ds_timer.Tock(); } - else - { - if constexpr (is_coreset_tree) - { - out_timer.Tick(); - auto &cls = d->clusters(); - for (auto &node : cls) - { - if (o->TimerCheck(input, node)) - { - if constexpr (buffer_enabled) - { - outliers_.push_back(node); - } - d->Remove(node); - } - } - if constexpr (timer_enabled && buffer_enabled) - { - std::vector del; - for (auto &out : outliers_) - { - if (o->TimerCheck(input, out)) - { - del.push_back(out); - } - } - for (auto &out : del) - { - std::ranges::remove_if(outliers_, [&](const NodePtr &n) { return n == out; }); - } + if constexpr (has_delete) { + node_map_[node].insert(input); + point_map_[input] = node; + } + if constexpr (!is_coreset_tree) { + if (input->index % param.time_window == 0) { + out_timer.Tick(); + auto &cls = d->clusters(); + std::vector del; + for (auto &node : cls) { + if (o->TimerCheck(input, node)) { + if constexpr (buffer_enabled) { + outliers_.push_back(node); + } else if constexpr (has_delete) { + for (auto &p : node_map_[node]) { + point_map_.erase(p); + } + node_map_.erase(node); } - out_timer.Tock(); + del.push_back(node); + } + } + for (auto &node : del) { + d->Remove(node); } - win_timer.Tick(); - 
OutputOnline(online_centers); - d = GenericFactory::New(param); - d->Init(); - outliers_ = {}; - win_timer.Tock(); + out_timer.Tock(); + } } - if constexpr (has_update) - { - win_timer.Tick(); - if (w->Update()) - { - d->ForEach([&](NodePtr node) { w->Update(node); }); - if constexpr (buffer_enabled) - { - std::ranges::for_each(outliers_, [&](NodePtr node) { w->Update(node); }); - } + } else { + if constexpr (is_coreset_tree) { + out_timer.Tick(); + auto &cls = d->clusters(); + for (auto &node : cls) { + if (o->TimerCheck(input, node)) { + if constexpr (buffer_enabled) { + outliers_.push_back(node); + } + d->Remove(node); + } + } + if constexpr (timer_enabled && buffer_enabled) { + std::vector del; + for (auto &out : outliers_) { + if (o->TimerCheck(input, out)) { + del.push_back(out); + } } - win_timer.Tock(); + for (auto &out : del) { + std::ranges::remove_if(outliers_, + [&](const NodePtr &n) { return n == out; }); + } + } + out_timer.Tock(); } - if constexpr (has_delete) - { - win_timer.Tick(); - PointPtr point = w->Delete(); - if (point != nullptr) - { - assert(point_map_.contains(point)); - auto node = point_map_[point]; - if (node != nullptr) - { - node->Update(point->Reverse(), true); - point_map_.erase(point); - node_map_[node].erase(point); - if constexpr (buffer_enabled) - { - if (node->cf.num == 0) - { - // TODO - const auto [first, last] = std::ranges::remove_if( - outliers_, [&](const NodePtr &n) { return n->cf.num == 0; }); - outliers_.erase(first, last); - } - } - } + win_timer.Tick(); + OutputOnline(online_centers); + d = GenericFactory::New(param); + d->Init(); + outliers_ = {}; + win_timer.Tock(); + } + if constexpr (has_update) { + win_timer.Tick(); + if (w->Update()) { + d->ForEach([&](NodePtr node) { w->Update(node); }); + if constexpr (buffer_enabled) { + std::ranges::for_each(outliers_, + [&](NodePtr node) { w->Update(node); }); + } + } + win_timer.Tock(); + } + if constexpr (has_delete) { + win_timer.Tick(); + PointPtr point = w->Delete(); 
+ if (point != nullptr) { + assert(point_map_.contains(point)); + auto node = point_map_[point]; + if (node != nullptr) { + node->Update(point->Reverse(), true); + point_map_.erase(point); + node_map_[node].erase(point); + if constexpr (buffer_enabled) { + if (node->cf.num == 0) { + // TODO + const auto [first, last] = std::ranges::remove_if( + outliers_, [&](const NodePtr &n) { return n->cf.num == 0; }); + outliers_.erase(first, last); + } } - win_timer.Tock(); + } } - lat_timer.Add(input->toa); + win_timer.Tock(); + } + lat_timer.Add(input->toa); } template -void StreamClustering::RunOffline(DataSinkPtr ptr) -{ - on_timer.Add(sum_timer.start); - ref_timer.Tick(); - OutputOnline(online_centers); - std::cout << "cluster_size: " << cluster_size_ << std::endl; - std::cout << "outlier_size: " << outlier_size_ << std::endl; - r->Run(param, online_centers, ptr); - ref_timer.Tock(); - sum_timer.Tock(); + requires StreamClusteringConcept +void StreamClustering::RunOffline(DataSinkPtr ptr) { + on_timer.Add(sum_timer.start); + ref_timer.Tick(); + OutputOnline(online_centers); + std::cout << "cluster_size: " << cluster_size_ << std::endl; + std::cout << "outlier_size: " << outlier_size_ << std::endl; + r->Run(param, online_centers, ptr); + ref_timer.Tock(); + sum_timer.Tock(); } template -StreamClustering::NodePtr StreamClustering::InsertOutliers(PointPtr point) -{ - if (outliers_.empty()) - { - auto node = GenericFactory::New(d, point); - node->index = 0; - node->timestamp = point->index; - outliers_.push_back(node); - return node; - } - else - { - auto closest = CalcClosestNode(outliers_, point); - if (closest.second < param.distance_threshold) - { - closest.first->Update(point); - closest.first->timestamp = point->index; - return closest.first; - } - else - { - auto node = GenericFactory::New(d, point); - node->index = outliers_.size(); - outliers_.push_back(node); - node->timestamp = point->index; - return node; - } + requires StreamClusteringConcept 
+StreamClustering::NodePtr +StreamClustering::InsertOutliers(PointPtr point) { + if (outliers_.empty()) { + auto node = GenericFactory::New(d, point); + node->index = 0; + node->timestamp = point->index; + outliers_.push_back(node); + return node; + } else { + auto closest = CalcClosestNode(outliers_, point); + if (closest.second < param.distance_threshold) { + closest.first->Update(point); + closest.first->timestamp = point->index; + return closest.first; + } else { + auto node = GenericFactory::New(d, point); + node->index = outliers_.size(); + outliers_.push_back(node); + node->timestamp = point->index; + return node; } + } } template -void StreamClustering::OutputOnline(std::vector ¢ers) -{ - // std::cerr << "Generic OutputOnline: " << d->clusters().size() << std::endl; - auto clusters = d->clusters(); - cluster_size_ += clusters.size(); - outlier_size_ += outliers_.size(); - for (int i = 0; i < clusters.size(); i++) - { - auto centroid = GenericFactory::New(param.dim, i, 1, 0); - for (int j = 0; j < param.dim; j++) - { - centroid->feature[j] = clusters[i]->cf.ls[j] / clusters[i]->cf.num; - } - centers.push_back(centroid); + requires StreamClusteringConcept +void StreamClustering::OutputOnline( + std::vector ¢ers) { + // std::cerr << "Generic OutputOnline: " << d->clusters().size() << std::endl; + auto clusters = d->clusters(); + cluster_size_ += clusters.size(); + outlier_size_ += outliers_.size(); + for (int i = 0; i < clusters.size(); i++) { + auto centroid = GenericFactory::New(param.dim, i); + for (int j = 0; j < param.dim; j++) { + centroid->feature[j] = clusters[i]->cf.ls[j] / clusters[i]->cf.num; } - for (int i = 0; i < outliers_.size(); ++i) - { - auto centroid = GenericFactory::New(param.dim, i, 1, 0); - for (int j = 0; j < param.dim; j++) - { - centroid->feature[j] = outliers_[i]->cf.ls[j] / outliers_[i]->cf.num; - } - centers.push_back(centroid); + centers.push_back(centroid); + } + for (int i = 0; i < outliers_.size(); ++i) { + auto centroid = 
GenericFactory::New(param.dim, i); + for (int j = 0; j < param.dim; j++) { + centroid->feature[j] = outliers_[i]->cf.ls[j] / outliers_[i]->cf.num; } + centers.push_back(centroid); + } } -} // namespace SESAME +} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_ALGORITHM_HPP_ +#endif // SESAME_INCLUDE_ALGORITHM_ALGORITHM_HPP_ diff --git a/include/Algorithm/DesignAspect/Param.hpp b/include/Algorithm/DesignAspect/Param.hpp index fca67456..944b3d00 100644 --- a/include/Algorithm/DesignAspect/Param.hpp +++ b/include/Algorithm/DesignAspect/Param.hpp @@ -10,8 +10,7 @@ #include "Utils/BenchmarkUtils.hpp" -namespace SESAME -{ +namespace SESAME { using StreamClusteringParam = param_t; @@ -47,6 +46,6 @@ using StreamClusteringParam = param_t; // double neighbor_distance, outlier_density_threshold; // }; -} // namespace SESAME +} // namespace SESAME #endif \ No newline at end of file diff --git a/include/Algorithm/DesignAspect/V10.hpp b/include/Algorithm/DesignAspect/V10.hpp index 8f28da87..5e1329cf 100644 --- a/include/Algorithm/DesignAspect/V10.hpp +++ b/include/Algorithm/DesignAspect/V10.hpp @@ -14,54 +14,50 @@ #include #include -namespace SESAME -{ -class V10Parameter : public SesameParam -{ +namespace SESAME { +class V10Parameter : public SesameParam { public: - bool isInit = false; - - double alpha; - double lamda; - double beta; - int num_cache; - double radius; - int landmark; - double minDelta; - int opt; + bool isInit = false; + + double alpha; + double lamda; + double beta; + int num_cache; + double radius; + int landmark; + double minDelta; + int opt; }; -class V10 : public Algorithm -{ +class V10 : public Algorithm { public: - double deltaT; - int actCluMaxNum = 10000; - double minRho; - double alpha; - - V10Parameter V10Param; - DPTreePtr dpTree; - OutPtr outres; - CachePtr cache; - std::vector onlineCenters; - std::unordered_set clusters; - - V10(param_t &cmd_params); - ~V10(); - void Init() override; - void setMinDelta(double minDelta); - void 
CountNode(const SESAME::DPNodePtr &node, int &num); - void InitDP(double time); - SESAME::DPNodePtr streamProcess(SESAME::PointPtr p, int opt, double time); - double computeAlpha(); - double adjustMinDelta(); - void delCluster(); - SESAME::DPNodePtr retrive(SESAME::PointPtr p, int opt, double time); - - void RunOnline(SESAME::PointPtr input) override; - - void RunOffline(DataSinkPtr sinkPtr) override; - void OutputOnline(std::vector &onlineCenters) override; + double deltaT; + int actCluMaxNum = 10000; + double minRho; + double alpha; + + V10Parameter V10Param; + DPTreePtr dpTree; + OutPtr outres; + CachePtr cache; + std::vector onlineCenters; + std::unordered_set clusters; + + V10(param_t &cmd_params); + void Init() override; + void setMinDelta(double minDelta); + void CountNode(const SESAME::DPNodePtr &node, int &num); + void InitDP(double time); + SESAME::DPNodePtr streamProcess(SESAME::PointPtr p, int opt, double time); + double computeAlpha(); + double adjustMinDelta(); + void delCluster(); + SESAME::DPNodePtr retrive(SESAME::PointPtr p, int opt, double time); + + void RunOnline(SESAME::PointPtr input) override; + + void RunOffline(DataSinkPtr sinkPtr) override; + void OutputOnline(std::vector &onlineCenters) override; }; -} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_DESIGNASPECT_V10_HPP_ +} // namespace SESAME +#endif // SESAME_INCLUDE_ALGORITHM_DESIGNASPECT_V10_HPP_ diff --git a/include/Algorithm/DesignAspect/V16.hpp b/include/Algorithm/DesignAspect/V16.hpp index 0ad780f6..bcffc60b 100644 --- a/include/Algorithm/DesignAspect/V16.hpp +++ b/include/Algorithm/DesignAspect/V16.hpp @@ -15,77 +15,86 @@ #include -/* This algorithm is composed by landmark window + Grids + Outlier detection with buffer + no - * refinement Note: we have removed all of the unnecessary modules since subsituting damped window - * with the landmark one makes the algorithm much more simple - * 1. 
the dm, dl, gap under landmark window is unchanged, which is cm, cl, cm - cl, following the - * assumption and calculation equation of 25 in DStream paper (equation 11 is not needed since the - * weight will not decay in landmark window). - * 2. the removeSporadic and checkIfSporadic function can be removed since the algorithm does not - * use outlier detection with timer - * 3. at this time, we donnot need to recalculate the parameter and thus ifReCalculate and - * reCalculateParameter function can also be removed - * 4. we remove outlier_density_thresholdFunction function and set lambda = = 1 for transfer - * convenience. Even if the weight is updated it is still the same. - * 5. In original DStream, it does not specifically set a buffer to store the outlier grid, however, - * it separate those outlier with the normal grid using a "SPARSE" label and still store them in the - * grid_list rather than deleting them. So we still treat it as using outlier buffer +/* This algorithm is composed by landmark window + Grids + Outlier detection + * with buffer + no refinement Note: we have removed all of the unnecessary + * modules since subsituting damped window with the landmark one makes the + * algorithm much more simple + * 1. the dm, dl, gap under landmark window is unchanged, which is cm, cl, cm - + * cl, following the assumption and calculation equation of 25 in DStream paper + * (equation 11 is not needed since the weight will not decay in landmark + * window). + * 2. the removeSporadic and checkIfSporadic function can be removed since the + * algorithm does not use outlier detection with timer + * 3. at this time, we donnot need to recalculate the parameter and thus + * ifReCalculate and reCalculateParameter function can also be removed + * 4. we remove outlier_density_thresholdFunction function and set lambda = = 1 + * for transfer convenience. Even if the weight is updated it is still the same. + * 5. 
In original DStream, it does not specifically set a buffer to store the + * outlier grid, however, it separate those outlier with the normal grid using a + * "SPARSE" label and still store them in the grid_list rather than deleting + * them. So we still treat it as using outlier buffer * */ -namespace SESAME -{ +namespace SESAME { -typedef std::unordered_map HashMap; -class V16 : public Algorithm -{ +typedef std::unordered_map + HashMap; +class V16 : public Algorithm { public: - int currentTimeStamp = 0; - int gap; // Time gap between calls to the offline component - double dm; // Density threshold for dense grids; controlled by cm - double dl; // Density threshold for sparse grids; controlled by cl - HashMap gridList; - // Store the deleted sporadic grids: - std::vector clusterList; // A list of all Grid Clusters - std::vector - newClusterList; // A list of grid clusters used when re-clustering an existing cluster. - std::vector minVals; // The minimum value seen for a numerical dim; used to calculate N - std::vector maxVals; // The maximum value seen for a numerical dim; used to calculate N - bool init = false; - std::vector Coord; - std::vector onlineCenters; - int q = 0; - std::vector windowGrid; + int currentTimeStamp = 0; + int gap; // Time gap between calls to the offline component + double dm; // Density threshold for dense grids; controlled by cm + double dl; // Density threshold for sparse grids; controlled by cl + HashMap gridList; + // Store the deleted sporadic grids: + std::vector clusterList; // A list of all Grid Clusters + std::vector newClusterList; // A list of grid clusters used when + // re-clustering an existing cluster. 
+ std::vector minVals; // The minimum value seen for a numerical dim; + // used to calculate N + std::vector maxVals; // The maximum value seen for a numerical dim; + // used to calculate N + bool init = false; + std::vector Coord; + std::vector onlineCenters; + int q = 0; + std::vector windowGrid; - V16(param_t &cmd_params); - ~V16(); - void Init(); - void RunOnline(PointPtr input) override; - void RunOffline(DataSinkPtr sinkPtr) override; - void OutputOnline(std::vector &onlineCenters) override; + V16(param_t &cmd_params); + ~V16(); + void Init(); + void RunOnline(PointPtr input) override; + void RunOffline(DataSinkPtr sinkPtr) override; + void OutputOnline(std::vector &onlineCenters) override; private: - void GridListUpdate(const std::vector &coordinate); - void initialClustering(); - void adjustClustering(); - bool adjustLabels(); - bool inspectChangedGrids(); - void RemoveWindowPointFromGrid(); - void calculateGridCoord(PointPtr point); - HashMap adjustForSparseGrid(const DensityGrid &grid, CharacteristicVector characteristicVec, - int gridClass); - HashMap adjustForDenseGrid(const DensityGrid &grid, CharacteristicVector characteristicVec, - int gridClass); - HashMap adjustForTransitionalGrid(const DensityGrid &grid, - CharacteristicVector characteristicVec, int gridClass); - HashMap reCluster(GridCluster &gridCluster); - HashMap adjustNewLabels(const HashMap &newGridList); - void mergeClusters(int smallCluster, int bigCluster); - void cleanClusters(); - HashMap cleanNewClusters(HashMap newGridList); - HashMap mergeNewClusters(HashMap newGridList, int smallCluster, int bigCluster); - void updateGridListDensity(); - static void mergeGridList(HashMap &gridList, const HashMap &otherList); - bool checkIfSporadic(CharacteristicVector characteristicVec); - void removeSporadic(); + void GridListUpdate(const std::vector &coordinate); + void initialClustering(); + void adjustClustering(); + bool adjustLabels(); + bool inspectChangedGrids(); + void 
RemoveWindowPointFromGrid(); + void calculateGridCoord(PointPtr point); + HashMap adjustForSparseGrid(const DensityGrid &grid, + CharacteristicVector characteristicVec, + int gridClass); + HashMap adjustForDenseGrid(const DensityGrid &grid, + CharacteristicVector characteristicVec, + int gridClass); + HashMap adjustForTransitionalGrid(const DensityGrid &grid, + CharacteristicVector characteristicVec, + int gridClass); + HashMap reCluster(GridCluster &gridCluster); + HashMap adjustNewLabels(const HashMap &newGridList); + void mergeClusters(int smallCluster, int bigCluster); + void cleanClusters(); + HashMap cleanNewClusters(HashMap newGridList); + HashMap mergeNewClusters(HashMap newGridList, int smallCluster, + int bigCluster); + void updateGridListDensity(); + static void mergeGridList(HashMap &gridList, const HashMap &otherList); + bool checkIfSporadic(CharacteristicVector characteristicVec); + void removeSporadic(); }; -} // namespace SESAME -#endif // COVERTBIRCH_FILE_INCLUDE_ALGORITHM_DESIGNASPECT_V16_HPP_ +} // namespace SESAME +#endif // COVERTBIRCH_FILE_INCLUDE_ALGORITHM_DESIGNASPECT_V16_HPP_ diff --git a/include/Algorithm/DesignAspect/V9.hpp b/include/Algorithm/DesignAspect/V9.hpp index ad9bb0ad..84f676ab 100644 --- a/include/Algorithm/DesignAspect/V9.hpp +++ b/include/Algorithm/DesignAspect/V9.hpp @@ -15,75 +15,84 @@ #include -/* This algorithm is composed by landmark window + Grids + Outlier detection with buffer + no - * refinement Note: we have removed all of the unnecessary modules since subsituting damped window - * with the landmark one makes the algorithm much more simple - * 1. the dm, dl, gap under landmark window is unchanged, which is cm, cl, cm - cl, following the - * assumption and calculation equation of 25 in DStream paper (equation 11 is not needed since the - * weight will not decay in landmark window). - * 2. 
the removeSporadic and checkIfSporadic function can be removed since the algorithm does not - * use outlier detection with timer - * 3. at this time, we donnot need to recalculate the parameter and thus ifReCalculate and - * reCalculateParameter function can also be removed - * 4. we remove outlier_density_thresholdFunction function and set lambda = = 1 for transfer - * convenience. Even if the weight is updated it is still the same. - * 5. In original DStream, it does not specifically set a buffer to store the outlier grid, however, - * it separate those outlier with the normal grid using a "SPARSE" label and still store them in the - * grid_list rather than deleting them. So we still treat it as using outlier buffer +/* This algorithm is composed by landmark window + Grids + Outlier detection + * with buffer + no refinement Note: we have removed all of the unnecessary + * modules since subsituting damped window with the landmark one makes the + * algorithm much more simple + * 1. the dm, dl, gap under landmark window is unchanged, which is cm, cl, cm - + * cl, following the assumption and calculation equation of 25 in DStream paper + * (equation 11 is not needed since the weight will not decay in landmark + * window). + * 2. the removeSporadic and checkIfSporadic function can be removed since the + * algorithm does not use outlier detection with timer + * 3. at this time, we donnot need to recalculate the parameter and thus + * ifReCalculate and reCalculateParameter function can also be removed + * 4. we remove outlier_density_thresholdFunction function and set lambda = = 1 + * for transfer convenience. Even if the weight is updated it is still the same. + * 5. In original DStream, it does not specifically set a buffer to store the + * outlier grid, however, it separate those outlier with the normal grid using a + * "SPARSE" label and still store them in the grid_list rather than deleting + * them. 
So we still treat it as using outlier buffer * */ -namespace SESAME -{ +namespace SESAME { -typedef std::unordered_map HashMap; -class V9 : public Algorithm -{ +typedef std::unordered_map + HashMap; +class V9 : public Algorithm { public: - int currentTimeStamp = 0; - int lastLandmark = 0; - int gap; // Time gap between calls to the offline component - double dm; // Density threshold for dense grids; controlled by cm - double dl; // Density threshold for sparse grids; controlled by cl - HashMap gridList; - // Store the deleted sporadic grids: - std::vector clusterList; // A list of all Grid Clusters - std::vector - newClusterList; // A list of grid clusters used when re-clustering an existing cluster. - std::vector minVals; // The minimum value seen for a numerical dim; used to calculate N - std::vector maxVals; // The maximum value seen for a numerical dim; used to calculate N - bool init = false; - std::vector Coord; - std::vector onlineCenters; - int q = 0; + int currentTimeStamp = 0; + int lastLandmark = 0; + int gap; // Time gap between calls to the offline component + double dm; // Density threshold for dense grids; controlled by cm + double dl; // Density threshold for sparse grids; controlled by cl + HashMap gridList; + // Store the deleted sporadic grids: + std::vector clusterList; // A list of all Grid Clusters + std::vector newClusterList; // A list of grid clusters used when + // re-clustering an existing cluster. 
+ std::vector minVals; // The minimum value seen for a numerical dim; + // used to calculate N + std::vector maxVals; // The maximum value seen for a numerical dim; + // used to calculate N + bool init = false; + std::vector Coord; + std::vector onlineCenters; + int q = 0; - V9(param_t &cmd_params); - ~V9(); - void Init(); - void RunOnline(PointPtr input) override; - void RunOffline(DataSinkPtr sinkPtr) override; + V9(param_t &cmd_params); + ~V9(); + void Init(); + void RunOnline(PointPtr input) override; + void RunOffline(DataSinkPtr sinkPtr) override; private: - void GridListUpdate(const std::vector &coordinate); - void initialClustering(); - void adjustClustering(); - bool adjustLabels(); - bool inspectChangedGrids(); - void calculateGridCoord(PointPtr point); - HashMap adjustForSparseGrid(const DensityGrid &grid, CharacteristicVector characteristicVec, - int gridClass); - HashMap adjustForDenseGrid(const DensityGrid &grid, CharacteristicVector characteristicVec, - int gridClass); - HashMap adjustForTransitionalGrid(const DensityGrid &grid, - CharacteristicVector characteristicVec, int gridClass); - HashMap reCluster(GridCluster &gridCluster); - HashMap adjustNewLabels(const HashMap &newGridList); - void mergeClusters(int smallCluster, int bigCluster); - void cleanClusters(); - HashMap cleanNewClusters(HashMap newGridList); - HashMap mergeNewClusters(HashMap newGridList, int smallCluster, int bigCluster); - void updateGridListDensity(); - static void mergeGridList(HashMap &gridList, const HashMap &otherList); - bool checkIfSporadic(CharacteristicVector characteristicVec); - void removeSporadic(); + void GridListUpdate(const std::vector &coordinate); + void initialClustering(); + void adjustClustering(); + bool adjustLabels(); + bool inspectChangedGrids(); + void calculateGridCoord(PointPtr point); + HashMap adjustForSparseGrid(const DensityGrid &grid, + CharacteristicVector characteristicVec, + int gridClass); + HashMap adjustForDenseGrid(const DensityGrid 
&grid, + CharacteristicVector characteristicVec, + int gridClass); + HashMap adjustForTransitionalGrid(const DensityGrid &grid, + CharacteristicVector characteristicVec, + int gridClass); + HashMap reCluster(GridCluster &gridCluster); + HashMap adjustNewLabels(const HashMap &newGridList); + void mergeClusters(int smallCluster, int bigCluster); + void cleanClusters(); + HashMap cleanNewClusters(HashMap newGridList); + HashMap mergeNewClusters(HashMap newGridList, int smallCluster, + int bigCluster); + void updateGridListDensity(); + static void mergeGridList(HashMap &gridList, const HashMap &otherList); + bool checkIfSporadic(CharacteristicVector characteristicVec); + void removeSporadic(); }; -} // namespace SESAME -#endif // COVERTBIRCH_FILE_INCLUDE_ALGORITHM_DESIGNASPECT_V9_HPP_ +} // namespace SESAME +#endif // COVERTBIRCH_FILE_INCLUDE_ALGORITHM_DESIGNASPECT_V9_HPP_ diff --git a/include/Algorithm/EDMStream.hpp b/include/Algorithm/EDMStream.hpp index 48d8b222..33ba6719 100644 --- a/include/Algorithm/EDMStream.hpp +++ b/include/Algorithm/EDMStream.hpp @@ -14,52 +14,48 @@ #include #include -namespace SESAME -{ -class EDMParameter : public SesameParam -{ +namespace SESAME { +class EDMParameter : public SesameParam { public: - bool isInit = false; + bool isInit = false; - double alpha; - double lamda; - double beta; - int num_cache; - double radius; + double alpha; + double lamda; + double beta; + int num_cache; + double radius; - double minDelta; - int opt; + double minDelta; + int opt; }; -class EDMStream : public Algorithm -{ +class EDMStream : public Algorithm { public: - double deltaT; - int actCluMaxNum = 10000; - double minRho; - double alpha; - - EDMParameter EDMParam; - DPTreePtr dpTree; - OutPtr outres; - CachePtr cache; - std::unordered_set clusters; - - EDMStream(param_t &cmd_params); - ~EDMStream(); - void Init() override; - void setMinDelta(double minDelta); - void CountNode(const SESAME::DPNodePtr &node, int &num); - void InitDP(double time); - 
SESAME::DPNodePtr streamProcess(SESAME::PointPtr p, int opt, double time); - double computeAlpha(); - double adjustMinDelta(); - void delCluster(); - SESAME::DPNodePtr retrive(SESAME::PointPtr p, int opt, double time); - - void RunOnline(SESAME::PointPtr input) override; - - void RunOffline(DataSinkPtr sinkPtr) override; + double deltaT; + int actCluMaxNum = 10000; + double minRho; + double alpha; + + EDMParameter EDMParam; + DPTreePtr dpTree; + OutPtr outres; + CachePtr cache; + std::unordered_set clusters; + + EDMStream(param_t &cmd_params); + void Init() override; + void setMinDelta(double minDelta); + void CountNode(const SESAME::DPNodePtr &node, int &num); + void InitDP(double time); + SESAME::DPNodePtr streamProcess(SESAME::PointPtr p, int opt, double time); + double computeAlpha(); + double adjustMinDelta(); + void delCluster(); + SESAME::DPNodePtr retrive(SESAME::PointPtr p, int opt, double time); + + void RunOnline(SESAME::PointPtr input) override; + + void RunOffline(DataSinkPtr sinkPtr) override; }; -} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_EDMSTREAM_HPP_ +} // namespace SESAME +#endif // SESAME_INCLUDE_ALGORITHM_EDMSTREAM_HPP_ diff --git a/include/Algorithm/OfflineRefinement/ConnectedRegions.hpp b/include/Algorithm/OfflineRefinement/ConnectedRegions.hpp index 94110021..d90d7aa6 100644 --- a/include/Algorithm/OfflineRefinement/ConnectedRegions.hpp +++ b/include/Algorithm/OfflineRefinement/ConnectedRegions.hpp @@ -12,45 +12,45 @@ #include "Algorithm/OfflineRefinement/OfflineRefinement.hpp" #include "Utils/Logger.hpp" -namespace SESAME -{ +namespace SESAME { -class ConnectedRegions : public OfflineRefinement -{ +class ConnectedRegions : public OfflineRefinement { public: - double alpha; // intersection factor, alpha - double min_weight; // minimum weight - std::vector> finalClusters; - unordered_map> connecvtivityGraphId; - ConnectedRegions(); - ConnectedRegions(double alpha, double min_weight); - void connection(std::vector µClusters, - 
SESAME::WeightedAdjacencyList &weightedAdjacencyList); - std::vector ResultsToDataSink(); + double alpha; // intersection factor, alpha + double min_weight; // minimum weight + std::vector> finalClusters; + unordered_map> connecvtivityGraphId; + ConnectedRegions(); + ConnectedRegions(double alpha, double min_weight); + void connection(std::vector µClusters, + SESAME::WeightedAdjacencyList &weightedAdjacencyList); + std::vector ResultsToDataSink(); - /** - * @Description: insert vertices and entries into connectivity graph when - * micro cluster pair connectivity value greater than the intersection - * threshold if the graph has testing micro cluster, add connected strong MC - * in the corresponding entries else, create new V,E into the graph - * @Param: connectivity graph, micro cluster 1 and 2 - * @Return: void - */ - void insertIntoGraph(const std::vector µClusters, int microClusterId, - int OtherId); - void insertIntoGraph(const std::vector µClusters, int microClusterId); - /** - * @Description: findConnectedComponents function visit the existing - * connectivity graph and find all connected strong MCs that will finally form - * arbitrary-shaped macro clusters each macro cluster will be stored as a - * vector of micro clusters, which will be transformed into point that stores - * in sink later - * @Param: connectivity graph - * @Return: void - */ - void findConnectedComponents(const std::vector µClusters); + /** + * @Description: insert vertices and entries into connectivity graph when + * micro cluster pair connectivity value greater than the intersection + * threshold if the graph has testing micro cluster, add connected strong MC + * in the corresponding entries else, create new V,E into the graph + * @Param: connectivity graph, micro cluster 1 and 2 + * @Return: void + */ + void insertIntoGraph(const std::vector µClusters, + int microClusterId, int OtherId); + void insertIntoGraph(const std::vector µClusters, + int microClusterId); + /** + * @Description: 
findConnectedComponents function visit the existing + * connectivity graph and find all connected strong MCs that will finally form + * arbitrary-shaped macro clusters each macro cluster will be stored as a + * vector of micro clusters, which will be transformed into point that stores + * in sink later + * @Param: connectivity graph + * @Return: void + */ + void + findConnectedComponents(const std::vector µClusters); }; -} // namespace SESAME +} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_OFFLINECLUSTERING_CONNECTEDREGIONS_HPP_ +#endif // SESAME_INCLUDE_ALGORITHM_OFFLINECLUSTERING_CONNECTEDREGIONS_HPP_ diff --git a/include/Algorithm/OfflineRefinement/DBSCAN.hpp b/include/Algorithm/OfflineRefinement/DBSCAN.hpp index a8bd62ce..20f897e9 100644 --- a/include/Algorithm/OfflineRefinement/DBSCAN.hpp +++ b/include/Algorithm/OfflineRefinement/DBSCAN.hpp @@ -4,8 +4,8 @@ #ifndef SESAME_INCLUDE_ALGORITHM_OFFLINECLUSTERING_DBSCAN_HPP_ #define SESAME_INCLUDE_ALGORITHM_OFFLINECLUSTERING_DBSCAN_HPP_ #include "Algorithm/DataStructure/Point.hpp" -#include "Algorithm/Param.hpp" #include "Algorithm/OfflineRefinement/OfflineRefinement.hpp" +#include "Algorithm/Param.hpp" #include "Sinks/DataSink.hpp" #include "Utils/UtilityFunctions.hpp" @@ -18,36 +18,37 @@ #include #include -namespace SESAME -{ +namespace SESAME { -//#define BORDER_POINT 2 -class DBSCAN : public OfflineRefinement -{ +// #define BORDER_POINT 2 +class DBSCAN : public OfflineRefinement { public: - DBSCAN(const SesameParam ¶m) {} - DBSCAN(unsigned int minPts, float eps); - DBSCAN(); - ~DBSCAN(); - void Run(SesameParam ¶m, std::vector &input, DataSinkPtr sinkPtr); + DBSCAN(const SesameParam ¶m) {} + DBSCAN(unsigned int minPts, float eps); + DBSCAN(); + ~DBSCAN(); + void Run(SesameParam ¶m, std::vector &input, + DataSinkPtr sinkPtr); - void run(std::vector &input); - void produceResult(std::vector &input, DataSinkPtr sinkPtr); + void run(std::vector &input); + void produceResult(std::vector &input, DataSinkPtr 
sinkPtr); private: - std::vector calculateCluster(std::vector &input, PointPtr &point) const; - int expandCluster(std::vector &input, PointPtr &point, int clusterID) const; - static bool judgeCorePoint(PointPtr &point, PointPtr &other); - static double calculateEluDistance(PointPtr &point, PointPtr &other); - // Obtain private members - unsigned int getTotalPointSize() const { return pointSize; } - unsigned int getMinimumClusterSize() const { return min_points; } - double getEpsilonSize() const { return epsilon; } - int getClusterID() const { return clusterID; } - unsigned int pointSize; - unsigned int min_points; - int clusterID; - double epsilon; + std::vector calculateCluster(std::vector &input, + PointPtr &point) const; + int expandCluster(std::vector &input, PointPtr &point, + int clusterID) const; + static bool judgeCorePoint(PointPtr &point, PointPtr &other); + static double calculateEluDistance(PointPtr &point, PointPtr &other); + // Obtain private members + unsigned int getTotalPointSize() const { return pointSize; } + unsigned int getMinimumClusterSize() const { return min_points; } + double getEpsilonSize() const { return epsilon; } + int getClusterID() const { return clusterID; } + unsigned int pointSize; + unsigned int min_points; + int clusterID; + double epsilon; }; -} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_OFFLINECLUSTERING_DBSCAN_HPP_ +} // namespace SESAME +#endif // SESAME_INCLUDE_ALGORITHM_OFFLINECLUSTERING_DBSCAN_HPP_ diff --git a/include/Algorithm/OfflineRefinement/KMeans.hpp b/include/Algorithm/OfflineRefinement/KMeans.hpp index 6db93ead..68937246 100644 --- a/include/Algorithm/OfflineRefinement/KMeans.hpp +++ b/include/Algorithm/OfflineRefinement/KMeans.hpp @@ -9,8 +9,8 @@ #define SESAME_INCLUDE_ALGORITHM_OFFLINECLUSTERING_KMEANS_HPP_ #include "Algorithm/DataStructure/Point.hpp" -#include "Algorithm/Param.hpp" #include "Algorithm/OfflineRefinement/OfflineRefinement.hpp" +#include "Algorithm/Param.hpp" #include 
"Sinks/DataSink.hpp" #include "Utils/Logger.hpp" @@ -23,41 +23,47 @@ #include #include -namespace SESAME -{ -class KMeans : public OfflineRefinement -{ +namespace SESAME { +class KMeans : public OfflineRefinement { public: - KMeans() {} - KMeans(const SesameParam ¶m) {} - // TODO: use template here - void Run(SesameParam ¶m, std::vector &online_centers, - DataSinkPtr sinkPtr); - void Run(SesameParam ¶m, std::vector &online_centers, - std::vector &results); - void produceResult(std::vector> &groups, DataSinkPtr sinkPtr); - void runKMeans(int numberOfCenters, int numberOfInput, std::vector ¢ers, - std::vector &input, std::vector> &oldGroups, - std::vector> &newGroups, int seed, bool KMeansPP); - void storeResult(std::vector> &groups, std::vector &output); + KMeans() {} + KMeans(const SesameParam ¶m) {} + // TODO: use template here + void Run(SesameParam ¶m, std::vector &online_centers, + DataSinkPtr sinkPtr); + void Run(SesameParam ¶m, std::vector &online_centers, + std::vector &results); + void produceResult(std::vector> &groups, + DataSinkPtr sinkPtr); + void runKMeans(int numberOfCenters, int numberOfInput, + std::vector ¢ers, std::vector &input, + std::vector> &oldGroups, + std::vector> &newGroups, int seed, + bool KMeansPP); + void storeResult(std::vector> &groups, + std::vector &output); private: - // Randomly chooses k centres with kMeans++ distribution - double calculateEluDistance(PointPtr &point, PointPtr ¢er); - void calculateClusterCenter(PointPtr ¢er, std::vector &group); - void randomSelectCenters(int numberOfCenters, int numberOfInput, std::vector &input, - std::vector ¢ers); - void selectCentersFromWeight(int numberOfCenters, int numberOfInput, - std::vector &input, std::vector ¢ers); - void groupPointsByCenters(int numberOfCenters, int numberOfInput, std::vector &input, - std::vector ¢ers, - std::vector> &groups); - void adjustClusteringCenters(std::vector ¢ers, - std::vector> &groups); - void refreshGroup(std::vector> &oldGroups, - std::vector> 
&newGroups); - void checkStopStatus(bool &flag, std::vector> &oldGroups, - std::vector> &newGroups); + // Randomly chooses k centres with kMeans++ distribution + double calculateEluDistance(PointPtr &point, PointPtr ¢er); + void calculateClusterCenter(PointPtr ¢er, std::vector &group); + void randomSelectCenters(int numberOfCenters, int numberOfInput, + std::vector &input, + std::vector ¢ers); + void selectCentersFromWeight(int numberOfCenters, int numberOfInput, + std::vector &input, + std::vector ¢ers); + void groupPointsByCenters(int numberOfCenters, int numberOfInput, + std::vector &input, + std::vector ¢ers, + std::vector> &groups); + void adjustClusteringCenters(std::vector ¢ers, + std::vector> &groups); + void refreshGroup(std::vector> &oldGroups, + std::vector> &newGroups); + void checkStopStatus(bool &flag, + std::vector> &oldGroups, + std::vector> &newGroups); }; -} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_OFFLINECLUSTERING_KMEANS_HPP_ +} // namespace SESAME +#endif // SESAME_INCLUDE_ALGORITHM_OFFLINECLUSTERING_KMEANS_HPP_ diff --git a/include/Algorithm/OfflineRefinement/OfflineRefinement.hpp b/include/Algorithm/OfflineRefinement/OfflineRefinement.hpp index 80284d1b..e544ebe7 100644 --- a/include/Algorithm/OfflineRefinement/OfflineRefinement.hpp +++ b/include/Algorithm/OfflineRefinement/OfflineRefinement.hpp @@ -13,26 +13,22 @@ #include -namespace SESAME -{ -class OfflineRefinement -{ +namespace SESAME { +class OfflineRefinement { public: - void Run(SesameParam ¶m, const std::vector &input, DataSinkPtr sinkPtr) - { - for (size_t i = 0; i < input.size(); ++i) - { - input[i]->setClusteringCenter(i); - sinkPtr->put(input[i]); - } + void Run(SesameParam ¶m, const std::vector &input, + DataSinkPtr sinkPtr) { + for (size_t i = 0; i < input.size(); ++i) { + input[i]->setClusteringCenter(i); + sinkPtr->put(input[i]); } + } }; -class NoRefinement : public OfflineRefinement -{ +class NoRefinement : public OfflineRefinement { public: - 
NoRefinement(const SesameParam ¶m) {} + NoRefinement(const SesameParam ¶m) {} }; -} // namespace SESAME -#endif // SESAME_SRC_ALGORITHM_OFFLINE_CLUSTERING_HPP_ \ No newline at end of file +} // namespace SESAME +#endif // SESAME_SRC_ALGORITHM_OFFLINE_CLUSTERING_HPP_ \ No newline at end of file diff --git a/include/Algorithm/OutlierDetection/OutlierDetection.hpp b/include/Algorithm/OutlierDetection/OutlierDetection.hpp index 1b134309..155ebc47 100644 --- a/include/Algorithm/OutlierDetection/OutlierDetection.hpp +++ b/include/Algorithm/OutlierDetection/OutlierDetection.hpp @@ -10,178 +10,142 @@ #include "Algorithm/DataStructure/CFTree.hpp" #include "Algorithm/Param.hpp" -namespace SESAME -{ +namespace SESAME { -template -class OutlierDetection -{ - const int outlier_cap_; - const size_t interval_; +template class OutlierDetection { + const int outlier_cap_; + const size_t interval_; public: - static constexpr bool buffer_enabled = B; - static constexpr bool timer_enabled = T; + static constexpr bool buffer_enabled = B; + static constexpr bool timer_enabled = T; - OutlierDetection(const SesameParam ¶m) - : outlier_cap_(param.outlier_cap), interval_(param.clean_interval) - {} - template - bool Check(PointPtr point, std::vector &nodes) - { - return false; - } - template - bool Check(N node) - { - if (node != nullptr) return node->cf.num < outlier_cap_; - return false; - } - template - bool TimerCheck(PointPtr input, N node) - { - if (node == nullptr) return false; - bool is_outlier = node->cf.num < outlier_cap_; - if constexpr (timer_enabled) - { - if (input->index - node->timestamp < interval_) is_outlier = false; - } - return is_outlier; + OutlierDetection(const SesameParam ¶m) + : outlier_cap_(param.outlier_cap), interval_(param.clean_interval) {} + template bool Check(PointPtr point, std::vector &nodes) { + return false; + } + template bool Check(N node) { + if (node != nullptr) + return node->cf.num < outlier_cap_; + return false; + } + template bool 
TimerCheck(PointPtr input, N node) { + if (node == nullptr) + return false; + bool is_outlier = node->cf.num < outlier_cap_; + if constexpr (timer_enabled) { + if (input->index - node->timestamp < interval_) + is_outlier = false; } + return is_outlier; + } }; -template -class DistanceDetection -{ +template class DistanceDetection { private: - const double outlier_distance_threshold_; - const int outlier_cap_; - const size_t interval_; - size_t cnt_ = 0; + const double outlier_distance_threshold_; + const int outlier_cap_; + const size_t interval_; + size_t cnt_ = 0; public: - static constexpr bool buffer_enabled = B; - static constexpr bool timer_enabled = T; + static constexpr bool buffer_enabled = B; + static constexpr bool timer_enabled = T; - DistanceDetection(const SesameParam ¶m) - : outlier_distance_threshold_(param.outlier_distance_threshold), - outlier_cap_(param.outlier_cap), - interval_(param.time_interval) - {} - template - bool Check(PointPtr point, std::vector &nodes) - { - if (nodes.empty()) return false; - auto dist = CalcClosestNode(nodes, point).second; - return dist > outlier_distance_threshold_; - } - template - bool Check(N node) - { - if (node != nullptr) return node->cf.num < outlier_cap_; - return false; - } - template - bool TimerCheck(PointPtr input, N node) - { - if (node == nullptr) return false; - bool is_outlier = node->cf.num < outlier_cap_; - if constexpr (timer_enabled) - { - if (input->index - node->timestamp < interval_) is_outlier = false; - } - return is_outlier; + DistanceDetection(const SesameParam ¶m) + : outlier_distance_threshold_(param.outlier_distance_threshold), + outlier_cap_(param.outlier_cap), interval_(param.time_interval) {} + template bool Check(PointPtr point, std::vector &nodes) { + if (nodes.empty()) + return false; + auto dist = CalcClosestNode(nodes, point).second; + return dist > outlier_distance_threshold_; + } + template bool Check(N node) { + if (node != nullptr) + return node->cf.num < outlier_cap_; + 
return false; + } + template bool TimerCheck(PointPtr input, N node) { + if (node == nullptr) + return false; + bool is_outlier = node->cf.num < outlier_cap_; + if constexpr (timer_enabled) { + if (input->index - node->timestamp < interval_) + is_outlier = false; } + return is_outlier; + } }; -template -class DensityDetection -{ - const double neighbor_distance_, outlier_density_threshold_; - const int outlier_cap_; - const size_t interval_; - size_t cnt_ = 0; +template class DensityDetection { + const double neighbor_distance_, outlier_density_threshold_; + const int outlier_cap_; + const size_t interval_; + size_t cnt_ = 0; public: - static constexpr bool buffer_enabled = B; - static constexpr bool timer_enabled = T; + static constexpr bool buffer_enabled = B; + static constexpr bool timer_enabled = T; - DensityDetection(const SesameParam ¶m) - : neighbor_distance_(param.neighbor_distance), - outlier_density_threshold_(param.outlier_density_threshold), - outlier_cap_(param.outlier_cap), - interval_(param.time_interval) - {} - template - bool Check(PointPtr point, std::vector &nodes) - { - std::vector neighborNodes; - int neighborDensity = 0, neighborNeighborDensity = 0; - for (auto node : nodes) - { - auto dist = point->L2Dist(node->Centroid()); - if (dist < neighbor_distance_) - { - neighborNodes.push_back(node); - neighborDensity += node->cf.num; - } - } - for (auto neighbor : neighborNodes) - { - for (auto node : nodes) - { - if (CalcClusterL2Dist(node, neighbor) < neighbor_distance_) - { - neighborNeighborDensity += node->cf.num; - } - } - } - if (neighborNeighborDensity == 0) - return false; - else - return (double)neighborDensity / neighborNeighborDensity < outlier_density_threshold_; - } - template - bool Check(N node) - { - if (node != nullptr) return node->cf.num < outlier_cap_; - return false; + DensityDetection(const SesameParam ¶m) + : neighbor_distance_(param.neighbor_distance), + outlier_density_threshold_(param.outlier_density_threshold), + 
outlier_cap_(param.outlier_cap), interval_(param.time_interval) {} + template bool Check(PointPtr point, std::vector &nodes) { + std::vector neighborNodes; + int neighborDensity = 0, neighborNeighborDensity = 0; + for (auto node : nodes) { + auto dist = point->L2Dist(node->Centroid()); + if (dist < neighbor_distance_) { + neighborNodes.push_back(node); + neighborDensity += node->cf.num; + } } - template - bool TimerCheck(PointPtr input, N node) - { - if (node == nullptr) return false; - bool is_outlier = node->cf.num < outlier_cap_; - if constexpr (timer_enabled) - { - if (input->index - node->timestamp < interval_) is_outlier = false; + for (auto neighbor : neighborNodes) { + for (auto node : nodes) { + if (CalcClusterL2Dist(node, neighbor) < neighbor_distance_) { + neighborNeighborDensity += node->cf.num; } - return is_outlier; + } + } + if (neighborNeighborDensity == 0) + return false; + else + return (double)neighborDensity / neighborNeighborDensity < + outlier_density_threshold_; + } + template bool Check(N node) { + if (node != nullptr) + return node->cf.num < outlier_cap_; + return false; + } + template bool TimerCheck(PointPtr input, N node) { + if (node == nullptr) + return false; + bool is_outlier = node->cf.num < outlier_cap_; + if constexpr (timer_enabled) { + if (input->index - node->timestamp < interval_) + is_outlier = false; } + return is_outlier; + } }; -class NoDetection -{ +class NoDetection { public: - static constexpr bool buffer_enabled = false; - static constexpr bool timer_enabled = false; - NoDetection(const SesameParam ¶m) {} - template - bool Check(PointPtr point, std::vector &nodes) - { - return false; - } - template - bool Check(N node) - { - return false; - } - template - bool TimerCheck(PointPtr input, N node) - { - return false; - } + static constexpr bool buffer_enabled = false; + static constexpr bool timer_enabled = false; + NoDetection(const SesameParam ¶m) {} + template bool Check(PointPtr point, std::vector &nodes) { + return 
false; + } + template bool Check(N node) { return false; } + template bool TimerCheck(PointPtr input, N node) { + return false; + } }; -} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_OUTLIERDETECTION_OUTLIERDETECTION_HPP_ +} // namespace SESAME +#endif // SESAME_INCLUDE_ALGORITHM_OUTLIERDETECTION_OUTLIERDETECTION_HPP_ diff --git a/include/Algorithm/Param.hpp b/include/Algorithm/Param.hpp index 51c8fdd3..9c57407b 100644 --- a/include/Algorithm/Param.hpp +++ b/include/Algorithm/Param.hpp @@ -3,191 +3,194 @@ #include #include -#include #include +#include -namespace SESAME -{ -enum AlgoType -{ - BirchType = 0, - StreamKMeansType = 1, - CluStreamType = 2, - DenStreamType = 3, - DBStreamType = 4, - EDMStreamType = 5, - DStreamType = 6, - SLKMeansType = 7, - BenneType = 8, - G1Stream = 21, - G2Stream, - G3Stream, - G4Stream, - G5Stream, - G6Stream, - G7Stream, - G8Stream, - G9Stream, - G10Stream, - G11Stream, - G12Stream, - G13Stream, - G14Stream, - G15Stream, - G16Stream +namespace SESAME { +enum AlgoType { + BirchType = 0, + StreamKMeansType = 1, + CluStreamType = 2, + DenStreamType = 3, + DBStreamType = 4, + EDMStreamType = 5, + DStreamType = 6, + SLKMeansType = 7, + BenneType = 8, + G1Stream = 21, + G2Stream, + G3Stream, + G4Stream, + G5Stream, + G6Stream, + G7Stream, + G8Stream, + G9Stream, + G10Stream, + G11Stream, + G12Stream, + G13Stream, + G14Stream, + G15Stream, + G16Stream }; extern char const *algo_names[64]; extern char const *benne_suffix[4]; -enum BenneObj -{ - balance = 0, - accuracy = 1, - efficiency = 2, - accuracy_no_migration = 3 +enum BenneObj { + balance = 0, + accuracy = 1, + efficiency = 2, + accuracy_no_migration = 3 }; -struct BenneThreshold -{ - int queue_size = 10000; // queue size for auto detection - int dim = 30; // above is high dimension - double variance = 100; // above is high concept drift - int outliers_num = 200; // above is many outliers - double outliers_dist = 50; // above is outlier +struct BenneThreshold { + int 
queue_size = 10000; // queue size for auto detection + int dim = 30; // above is high dimension + double variance = 100; // above is high concept drift + int outliers_num = 200; // above is many outliers + double outliers_dist = 50; // above is outlier }; -struct param_t -{ - int num_points = 500, dim = 2, num_clusters = 2; - int arr_rate = 0; - bool time_decay = false; - size_t coreset_size = 100; - int seed = 1; - bool fast_source = false; - bool store = true; - - std::string input_file = "datasets/CoverType.txt", output_file = "sesame.out"; - AlgoType algo; - - int num_last_arr = 60, time_window = 6; // also used in timer outlier detection - size_t time_interval = 100; - int num_online_clusters = 10; - - int buf_size = 500, offline_time_window = 100; - - // CF Tree - int max_in_nodes = 3, max_leaf_nodes = 3; - double distance_threshold = 3550; - - // used in DBSCAN - unsigned int min_points = 10; - double epsilon = 50; - - // used in DenStream(unique) - double base = 2, lambda = 1, mu = 7; - double beta = 0.0021; // Also used in DStream, but different meaning - - // EDMStream - double delta = 10; - int num_cache = 100, opt = 2; - - // used in DBStream - double radius = 0.1, min_weight, alpha = 0.998; - size_t clean_interval = 2500; // also used in timer outlier detection - - // used in DStream - double cm = 5.0, cl = 0.8; - double grid_width = 12.0; - - // used in design aspect - bool run_offline = true; // determine whether to run the offline refinement - bool run_eval = true; - bool run_cmm = false, run_pur = true, run_nmi = false; - // bool run_group = true; - int landmark = 1000; // this is the index of landmark point[start from 0](determine - // to process the algorithm from which algorithm) - int sliding = 10; // since we test the count-based sliding window, this is - // the count number - double outlier_distance_threshold = 1000; // the max distance of the incoming point - // to its nearest clusters - double outlier_density_threshold = 100; // the density 
value of the point to be - // treated as an outlier - double neighbor_distance = 200; // the distance value of the point to judge - // neighborhoods - int outlier_cap = 5; // transfer outlier cluster and true cluster - bool kmeanspp = true; // whether use kmeans++ to initialize the centroids - int k = 2; // number of k in kmeans / kmeanspp - - double delta_grid = 0.2; // The delta parameter used int the grid for guessing the optimum. - int num_samples = 100; // The number of samples used in the grid for guessing - // the optimum. - - size_t num_res = 0; - - BenneObj obj = (BenneObj) 0; - BenneThreshold benne_threshold; - - void Print() - { - std::cout << "algo_id: " << algo << std::endl; - std::cout << "algo: \"" << Name() << "\"" << std::endl; - std::cout << "workload: \"" << Workload() << "\"" << std::endl; - std::cout << "num_points: " << num_points << std::endl; - std::cout << "dim: " << dim << std::endl; - std::cout << "num_clusters: " << num_clusters << std::endl; - std::cout << "arr_rate: " << arr_rate << std::endl; - std::cout << "max_in_nodes: " << max_in_nodes << std::endl; - std::cout << "max_leaf_nodes: " << max_leaf_nodes << std::endl; - std::cout << "distance_threshold: " << distance_threshold << std::endl; - std::cout << "seed: " << seed << std::endl; - std::cout << "coreset_size: " << coreset_size << std::endl; - std::cout << "radius: " << radius << std::endl; - std::cout << "delta: " << delta << std::endl; - std::cout << "beta: " << beta << std::endl; - std::cout << "buf_size: " << buf_size << std::endl; - std::cout << "alpha: " << alpha << std::endl; - std::cout << "lambda: " << lambda << std::endl; - std::cout << "clean_interval: " << clean_interval << std::endl; - std::cout << "min_weight: " << min_weight << std::endl; - std::cout << "base: " << base << std::endl; - std::cout << "cm: " << cm << std::endl; - std::cout << "cl: " << cl << std::endl; - std::cout << "grid_width: " << grid_width << std::endl; - std::cout << "min_points: " << 
min_points << std::endl; - std::cout << "epsilon: " << epsilon << std::endl; - std::cout << "mu: " << mu << std::endl; - std::cout << "num_last_arr: " << num_last_arr << std::endl; - std::cout << "time_window: " << time_window << std::endl; - std::cout << "num_online_clusters: " << num_online_clusters << std::endl; - std::cout << "delta_grid: " << delta_grid << std::endl; - std::cout << "num_samples: " << num_samples << std::endl; - std::cout << "landmark: " << landmark << std::endl; - std::cout << "sliding: " << sliding << std::endl; - std::cout << "outlier_distance_threshold: " << outlier_distance_threshold << std::endl; - std::cout << "outlier_cap: " << outlier_cap << std::endl; - std::cout << "outlier_density_threshold: " << outlier_density_threshold << std::endl; - std::cout << "neighbor_distance: " << neighbor_distance << std::endl; - std::cout << "k: " << k << std::endl; - std::cout << "run_offline: " << run_offline << std::endl; - std::cout << "obj: " << obj << std::endl; - std::cout << "queue_size_threshold: " << benne_threshold.queue_size << std::endl; - std::cout << "dim_threshold: " << benne_threshold.dim << std::endl; - std::cout << "variance_threshold: " << benne_threshold.variance << std::endl; - std::cout << "outliers_num_threshold: " << benne_threshold.outliers_num << std::endl; - std::cout << "outliers_dist_threshold: " << benne_threshold.outliers_dist << std::endl; - } - std::string Workload() { return std::filesystem::path(input_file).stem(); } - std::string Name() { - if (algo == BenneType) { - return std::string(algo_names[algo]) + benne_suffix[obj]; - } - return algo_names[algo]; +struct param_t { + int num_points = 500, dim = 2, num_clusters = 2; + int arr_rate = 0; + bool time_decay = false; + size_t coreset_size = 100; + int seed = 1; + bool fast_source = false; + bool store = true; + + std::string input_file = "datasets/CoverType.txt", output_file = "sesame.out"; + AlgoType algo; + + int num_last_arr = 60, + time_window = 6; // also used 
in timer outlier detection + size_t time_interval = 100; + int num_online_clusters = 10; + + int buf_size = 500, offline_time_window = 100; + + // CF Tree + int max_in_nodes = 3, max_leaf_nodes = 3; + double distance_threshold = 3550; + + // used in DBSCAN + unsigned int min_points = 10; + double epsilon = 50; + + // used in DenStream(unique) + double base = 2, lambda = 1, mu = 7; + double beta = 0.0021; // Also used in DStream, but different meaning + + // EDMStream + double delta = 10; + int num_cache = 100, opt = 2; + + // used in DBStream + double radius = 0.1, min_weight, alpha = 0.998; + size_t clean_interval = 2500; // also used in timer outlier detection + + // used in DStream + double cm = 5.0, cl = 0.8; + double grid_width = 12.0; + + // used in design aspect + bool run_offline = true; // determine whether to run the offline refinement + bool run_eval = true; + bool run_cmm = false, run_pur = true, run_nmi = false; + // bool run_group = true; + int landmark = + 1000; // this is the index of landmark point[start from 0](determine + // to process the algorithm from which algorithm) + int sliding = 10; // since we test the count-based sliding window, this is + // the count number + double outlier_distance_threshold = 1000; // the max distance of the incoming + // point to its nearest clusters + double outlier_density_threshold = 100; // the density value of the point to + // be treated as an outlier + double neighbor_distance = 200; // the distance value of the point to judge + // neighborhoods + int outlier_cap = 5; // transfer outlier cluster and true cluster + bool kmeanspp = true; // whether use kmeans++ to initialize the centroids + int k = 2; // number of k in kmeans / kmeanspp + + double delta_grid = + 0.2; // The delta parameter used int the grid for guessing the optimum. + int num_samples = 100; // The number of samples used in the grid for guessing + // the optimum. 
+ + size_t num_res = 0; + + BenneObj obj = (BenneObj)0; + BenneThreshold benne_threshold; + + void Print() { + std::cout << "algo_id: " << algo << std::endl; + std::cout << "algo: \"" << Name() << "\"" << std::endl; + std::cout << "workload: \"" << Workload() << "\"" << std::endl; + std::cout << "num_points: " << num_points << std::endl; + std::cout << "dim: " << dim << std::endl; + std::cout << "num_clusters: " << num_clusters << std::endl; + std::cout << "arr_rate: " << arr_rate << std::endl; + std::cout << "max_in_nodes: " << max_in_nodes << std::endl; + std::cout << "max_leaf_nodes: " << max_leaf_nodes << std::endl; + std::cout << "distance_threshold: " << distance_threshold << std::endl; + std::cout << "seed: " << seed << std::endl; + std::cout << "coreset_size: " << coreset_size << std::endl; + std::cout << "radius: " << radius << std::endl; + std::cout << "delta: " << delta << std::endl; + std::cout << "beta: " << beta << std::endl; + std::cout << "buf_size: " << buf_size << std::endl; + std::cout << "alpha: " << alpha << std::endl; + std::cout << "lambda: " << lambda << std::endl; + std::cout << "clean_interval: " << clean_interval << std::endl; + std::cout << "min_weight: " << min_weight << std::endl; + std::cout << "base: " << base << std::endl; + std::cout << "cm: " << cm << std::endl; + std::cout << "cl: " << cl << std::endl; + std::cout << "grid_width: " << grid_width << std::endl; + std::cout << "min_points: " << min_points << std::endl; + std::cout << "epsilon: " << epsilon << std::endl; + std::cout << "mu: " << mu << std::endl; + std::cout << "num_last_arr: " << num_last_arr << std::endl; + std::cout << "time_window: " << time_window << std::endl; + std::cout << "num_online_clusters: " << num_online_clusters << std::endl; + std::cout << "delta_grid: " << delta_grid << std::endl; + std::cout << "num_samples: " << num_samples << std::endl; + std::cout << "landmark: " << landmark << std::endl; + std::cout << "sliding: " << sliding << std::endl; + 
std::cout << "outlier_distance_threshold: " << outlier_distance_threshold + << std::endl; + std::cout << "outlier_cap: " << outlier_cap << std::endl; + std::cout << "outlier_density_threshold: " << outlier_density_threshold + << std::endl; + std::cout << "neighbor_distance: " << neighbor_distance << std::endl; + std::cout << "k: " << k << std::endl; + std::cout << "run_offline: " << run_offline << std::endl; + std::cout << "obj: " << obj << std::endl; + std::cout << "queue_size_threshold: " << benne_threshold.queue_size + << std::endl; + std::cout << "dim_threshold: " << benne_threshold.dim << std::endl; + std::cout << "variance_threshold: " << benne_threshold.variance + << std::endl; + std::cout << "outliers_num_threshold: " << benne_threshold.outliers_num + << std::endl; + std::cout << "outliers_dist_threshold: " << benne_threshold.outliers_dist + << std::endl; + } + std::string Workload() { return std::filesystem::path(input_file).stem(); } + std::string Name() { + if (algo == BenneType) { + return std::string(algo_names[algo]) + benne_suffix[obj]; } + return algo_names[algo]; + } }; using SesameParam = param_t; -} // namespace SESAME +} // namespace SESAME -#endif // PARAM_HPP_ \ No newline at end of file +#endif // PARAM_HPP_ \ No newline at end of file diff --git a/include/Algorithm/SlidingWindowClustering.hpp b/include/Algorithm/SlidingWindowClustering.hpp index 48626349..8d6a634b 100644 --- a/include/Algorithm/SlidingWindowClustering.hpp +++ b/include/Algorithm/SlidingWindowClustering.hpp @@ -9,40 +9,36 @@ #include #include -namespace SESAME -{ -void k_means_plus_plus(Random *r, const std::vector> &instance, +namespace SESAME { +void k_means_plus_plus(Random *r, + const std::vector> &instance, int32_t k, std::vector *centers, double *cost); // Class that handles the time stamps of a sliding window. The timestamps start // from 0. 
-class SlidingWindowHandler -{ +class SlidingWindowHandler { public: - SlidingWindowHandler(const int window_size) - : curr_time_(-1), begin_window_(0), window_size_(window_size) - {} - - // Increase time counter. - void next() - { - curr_time_++; - if (curr_time_ >= window_size_) - { - begin_window_++; - } - assert(curr_time_ - begin_window_ + 1 <= window_size_); + SlidingWindowHandler(const int window_size) + : curr_time_(-1), begin_window_(0), window_size_(window_size) {} + + // Increase time counter. + void next() { + curr_time_++; + if (curr_time_ >= window_size_) { + begin_window_++; } + assert(curr_time_ - begin_window_ + 1 <= window_size_); + } - // Current time. - inline int64_t curr_time() const { return curr_time_; } - // Beginning of the window. - inline int64_t begin_window() const { return begin_window_; } + // Current time. + inline int64_t curr_time() const { return curr_time_; } + // Beginning of the window. + inline int64_t begin_window() const { return begin_window_; } private: - int64_t curr_time_; - int64_t begin_window_; - const int32_t window_size_; + int64_t curr_time_; + int64_t begin_window_; + const int32_t window_size_; }; // This class implements an estimate of the total weight inserted after a @@ -50,675 +46,619 @@ class SlidingWindowHandler // insertion are inserted in increasing order. This can be used to keep track of // the number of items inserted in a bucket after a certain tiem or to keep // track of the total cost of the bucket. -class ApproxTimeCountKeeper -{ +class ApproxTimeCountKeeper { public: - // Epsilon is the approximation error allowed (i.e., we allow 1+epsilon). - explicit ApproxTimeCountKeeper(double epsilon) : epsilon_(epsilon) { assert(epsilon_ > 0); } - ~ApproxTimeCountKeeper() {} - - // Increase the count by 'how_much' at time 'time'. 
- void increase_total(const int64_t time, const double how_much) - { - for (auto &time_count : time_counts_) - { - assert(time_count.first < time); - time_count.second += how_much; - } - - time_counts_.push_back(std::make_pair(time, how_much)); - - auto it = time_counts_.begin(); - // Compact indices if they are not too different. - while (true) - { - auto it2 = it; - // Move twice ahead if possible. - if (it2 == time_counts_.end()) - { - break; - } - it2++; - if (it2 == time_counts_.end()) - { - break; - } - auto it_middle = it2; - it2++; - if (it2 == time_counts_.end()) - { - break; - } - // Check if need to remove middle - if (it->second <= (1.0 + epsilon_) * it2->second) - { - it_middle = time_counts_.erase(it_middle); - it = it_middle; - } - else - { - ++it; - } - } + // Epsilon is the approximation error allowed (i.e., we allow 1+epsilon). + explicit ApproxTimeCountKeeper(double epsilon) : epsilon_(epsilon) { + assert(epsilon_ > 0); + } + ~ApproxTimeCountKeeper() {} + + // Increase the count by 'how_much' at time 'time'. + void increase_total(const int64_t time, const double how_much) { + for (auto &time_count : time_counts_) { + assert(time_count.first < time); + time_count.second += how_much; } - // Returns a 1+epsilon estimate of the total weight added to the counter after - // begin_time. - double total_after_time(const int64_t begin_time) const - { - double count = time_counts_.front().second; - for (const auto &time_count : time_counts_) - { - if (time_count.first <= begin_time) - { - count = time_count.second; - } - } - return count; + time_counts_.push_back(std::make_pair(time, how_much)); + + auto it = time_counts_.begin(); + // Compact indices if they are not too different. + while (true) { + auto it2 = it; + // Move twice ahead if possible. 
+ if (it2 == time_counts_.end()) { + break; + } + it2++; + if (it2 == time_counts_.end()) { + break; + } + auto it_middle = it2; + it2++; + if (it2 == time_counts_.end()) { + break; + } + // Check if need to remove middle + if (it->second <= (1.0 + epsilon_) * it2->second) { + it_middle = time_counts_.erase(it_middle); + it = it_middle; + } else { + ++it; + } + } + } + + // Returns a 1+epsilon estimate of the total weight added to the counter after + // begin_time. + double total_after_time(const int64_t begin_time) const { + double count = time_counts_.front().second; + for (const auto &time_count : time_counts_) { + if (time_count.first <= begin_time) { + count = time_count.second; + } } + return count; + } - // Returns the count at 0. - double count_at_0() const { return time_counts_.front().second; } + // Returns the count at 0. + double count_at_0() const { return time_counts_.front().second; } private: - std::list> time_counts_; - // Approximation error. - double epsilon_; + std::list> time_counts_; + // Approximation error. + double epsilon_; }; // This is the class that has to be implemented for a sliding window summary // algorithm allowing composable sketches. -class SummaryAlg -{ +class SummaryAlg { public: - // window is the window length - // k is the number of centers - // lambda is the threshold used by the summary - // gen is a random be generator - SummaryAlg(Random *r, int64_t window, int32_t k, double lambda) - : r(r), window_(window), k_(k), lambda_(lambda), is_empty_(true), first_element_time_(-1) - {} - virtual ~SummaryAlg() {} - - // Processes one point. 
- void process_point(PointPtr point) - { - if (is_empty_) - { - is_empty_ = false; - } - process_point_impl(point); + // window is the window length + // k is the number of centers + // lambda is the threshold used by the summary + // gen is a random be generator + SummaryAlg(Random *r, int64_t window, int32_t k, double lambda) + : r(r), window_(window), k_(k), lambda_(lambda), is_empty_(true), + first_element_time_(-1) {} + virtual ~SummaryAlg() {} + + // Processes one point. + void process_point(PointPtr point) { + if (is_empty_) { + is_empty_ = false; } - - // Resets the summary. - void reset() - { - is_empty_ = true; - first_element_time_ = -1; - reset_impl(); - } - - Random *r; - - // Returns the time of the first element in the stream, - int64_t first_element_time() const { return first_element_time_; } - - // Returns whether the stream is empty, - bool is_empty() const { return is_empty_; } - - // These functions and the ones below must to be implemented by the derived - // class. - // Returns the solution and the cost. - virtual void solution(vector *sol, double *value) const = 0; - - // Output the solution resulting from a composion this summary (as summary - // B -- right summary) with summary_A on the left. - virtual void solution_left_composition(const SummaryAlg &summary_A, int64_t window_begin, - vector *sol, double *value) const = 0; - - // Output the result of composing this summary (as summary A) with and empty - // summary_B on the right. - virtual void solution_right_composition_with_empty(int64_t window_begin, vector *sol, - double *value) const = 0; + process_point_impl(point); + } + + // Resets the summary. 
+ void reset() { + is_empty_ = true; + first_element_time_ = -1; + reset_impl(); + } + + Random *r; + + // Returns the time of the first element in the stream, + int64_t first_element_time() const { return first_element_time_; } + + // Returns whether the stream is empty, + bool is_empty() const { return is_empty_; } + + // These functions and the ones below must to be implemented by the derived + // class. + // Returns the solution and the cost. + virtual void solution(vector *sol, double *value) const = 0; + + // Output the solution resulting from a composion this summary (as summary + // B -- right summary) with summary_A on the left. + virtual void solution_left_composition(const SummaryAlg &summary_A, + int64_t window_begin, + vector *sol, + double *value) const = 0; + + // Output the result of composing this summary (as summary A) with and empty + // summary_B on the right. + virtual void solution_right_composition_with_empty(int64_t window_begin, + vector *sol, + double *value) const = 0; protected: - // Actual implementation of prcess_point - virtual void process_point_impl(PointPtr point) = 0; - // Actual implementation of reset. Notice that the class must be responsible - // for keeping updated the number of items stored when the reset function is - // called. - virtual void reset_impl() = 0; + // Actual implementation of prcess_point + virtual void process_point_impl(PointPtr point) = 0; + // Actual implementation of reset. Notice that the class must be responsible + // for keeping updated the number of items stored when the reset function is + // called. + virtual void reset_impl() = 0; private: - int32_t window_; - int32_t k_; - double lambda_; - bool is_empty_; - int64_t first_element_time_; + int32_t window_; + int32_t k_; + double lambda_; + bool is_empty_; + int64_t first_element_time_; }; // Implementation of the KMeans summary algorithm for sliding window streams // using the Meyerson sketch. 
-class KMeansSummary : public SummaryAlg -{ +class KMeansSummary : public SummaryAlg { public: - KMeansSummary(Random *r, int64_t window, int32_t k, double optimum_upperbound_guess) - : SummaryAlg(r, window, k, optimum_upperbound_guess) - { - max_sketch_size_ = - std::pow(2, 2 * 2 + 1) * 4. * k * (1. + std::log(window * 3)) * (1.0 + 1. / 0.5); - distance_denominator_ = (optimum_upperbound_guess) / (k * (1. + std::log(window * 3))); - // sketch_number_ = std::log(3 * 1. / error_prob); - k_ = k; - reset_impl(); + KMeansSummary(Random *r, int64_t window, int32_t k, + double optimum_upperbound_guess) + : SummaryAlg(r, window, k, optimum_upperbound_guess) { + max_sketch_size_ = std::pow(2, 2 * 2 + 1) * 4. * k * + (1. + std::log(window * 3)) * (1.0 + 1. / 0.5); + distance_denominator_ = + (optimum_upperbound_guess) / (k * (1. + std::log(window * 3))); + // sketch_number_ = std::log(3 * 1. / error_prob); + k_ = k; + reset_impl(); + } + + virtual ~KMeansSummary() {} + + void solution(vector *centers_solution, + double *cost_solution) const override { + centers_solution->clear(); + double min_cost = std::numeric_limits::max(); + int best_pos = 0; + for (int i = 0; i < kSketchNumber; i++) { + double cost; + vector centers; + bool ok = sketches_.at(i).solution(/*after_time=*/0, ¢ers, &cost); + if (ok && cost < min_cost) { + min_cost = cost; + best_pos = i; + } } - - virtual ~KMeansSummary() {} - - void solution(vector *centers_solution, double *cost_solution) const override - { - centers_solution->clear(); - double min_cost = std::numeric_limits::max(); - int best_pos = 0; - for (int i = 0; i < kSketchNumber; i++) - { - double cost; - vector centers; - bool ok = sketches_.at(i).solution(/*after_time=*/0, ¢ers, &cost); - if (ok && cost < min_cost) - { - min_cost = cost; - best_pos = i; - } - } - sketches_.at(best_pos).solution(/* after_time=*/0, centers_solution, cost_solution); + sketches_.at(best_pos).solution(/* after_time=*/0, centers_solution, + cost_solution); + } + + 
void solution_left_composition(const SummaryAlg &summary_A, + int64_t window_begin, vector *sol, + double *value) const override { + sol->clear(); + const KMeansSummary &summary_A_type = + dynamic_cast(summary_A); + + double min_cost = std::numeric_limits::max(); + int best_pos = 0; + assert(summary_A_type.sketches_.size() == sketches_.size()); + + for (int i = 0; i < kSketchNumber; i++) { + double cost; + vector centers; + bool ok = sketches_.at(i).compose_left_solution( + summary_A_type.sketches_.at(i), window_begin, ¢ers, &cost); + if (ok && cost < min_cost) { + min_cost = cost; + best_pos = i; + } } - void solution_left_composition(const SummaryAlg &summary_A, int64_t window_begin, - vector *sol, double *value) const override - { - sol->clear(); - const KMeansSummary &summary_A_type = dynamic_cast(summary_A); - - double min_cost = std::numeric_limits::max(); - int best_pos = 0; - assert(summary_A_type.sketches_.size() == sketches_.size()); - - for (int i = 0; i < kSketchNumber; i++) - { - double cost; - vector centers; - bool ok = sketches_.at(i).compose_left_solution(summary_A_type.sketches_.at(i), - window_begin, ¢ers, &cost); - if (ok && cost < min_cost) - { - min_cost = cost; - best_pos = i; - } - } + sketches_.at(best_pos).compose_left_solution( + summary_A_type.sketches_.at(best_pos), window_begin, sol, value); + } + + void + solution_right_composition_with_empty(int64_t window_begin, + vector *centers_solution, + double *cost_solution) const override { + double min_cost = std::numeric_limits::max(); + int best_pos = 0; + for (int i = 0; i < kSketchNumber; i++) { + double cost; + vector centers; + bool ok = sketches_.at(i).solution(window_begin, ¢ers, &cost); + if (ok && cost < min_cost) { + min_cost = cost; + best_pos = i; + } + } + sketches_.at(best_pos).solution(window_begin, centers_solution, + cost_solution); + } + + // This class implements the well-known meyerson sketch for k-means with all + // the bookkeping needed for the sliding window 
algorithm. This class + // implements a single sketch. + class MeyersonSketch { + public: + // max_num_centers is the maximum number of centers allowed in the sketch. + // denominator_prob is the denominator in the probability of selecting a + // point as center. epsilon_multiplicities is the epsilon factor used in the + // estimate of the multiplicites of the centers. k is the target number of + // centers. gen is a random source. + MeyersonSketch(Random *r, const double max_num_centers, + const double denominator_prob, + const double epsilon_multiplicities, const int32_t k) + : r(r), max_num_centers_(max_num_centers), + denominator_prob_(denominator_prob), + epsilon_multiplicities_(epsilon_multiplicities), k_(k), + failed_sketch_(false) {} + // Notice how the number of items stored is updated by the destructor. + virtual ~MeyersonSketch() {} + + // Copy operator. Handles the number of items stored. + MeyersonSketch(MeyersonSketch const &other) { + r = other.r; + max_num_centers_ = other.max_num_centers_; + denominator_prob_ = other.denominator_prob_; + epsilon_multiplicities_ = other.epsilon_multiplicities_; + k_ = other.k_; + // gen_ = other.gen_; + failed_sketch_ = other.failed_sketch_; + centers_ = other.centers_; + // ITEMS_STORED += centers_.size(); + multiplicities_ = other.multiplicities_; + costs_sum_dist_ = other.costs_sum_dist_; + costs_sum_sq_dist_ = other.costs_sum_sq_dist_; + } - sketches_.at(best_pos).compose_left_solution(summary_A_type.sketches_.at(best_pos), - window_begin, sol, value); + MeyersonSketch &operator=(MeyersonSketch const &other) { + r = other.r; + max_num_centers_ = other.max_num_centers_; + denominator_prob_ = other.denominator_prob_; + epsilon_multiplicities_ = other.epsilon_multiplicities_; + k_ = other.k_; + // gen_ = other.gen_; + failed_sketch_ = other.failed_sketch_; + // The previous number of centers is freed. 
+ // ITEMS_STORED -= centers_.size(); + centers_ = other.centers_; + // ITEMS_STORED += centers_.size(); + multiplicities_ = other.multiplicities_; + costs_sum_dist_ = other.costs_sum_dist_; + costs_sum_sq_dist_ = other.costs_sum_sq_dist_; + return *this; } - void solution_right_composition_with_empty(int64_t window_begin, - vector *centers_solution, - double *cost_solution) const override - { - double min_cost = std::numeric_limits::max(); - int best_pos = 0; - for (int i = 0; i < kSketchNumber; i++) - { - double cost; - vector centers; - bool ok = sketches_.at(i).solution(window_begin, ¢ers, &cost); - if (ok && cost < min_cost) - { - min_cost = cost; - best_pos = i; - } + // Add a point + void add_point(const PointPtr &point) { + if (failed_sketch_) { + return; + } + if (centers_.empty()) { + create_center(point); + return; + } + double min_distance = std::numeric_limits::max(); + int64_t best_center; + for (int i = 0; i < centers_.size(); i++) { + double d = point->L2Dist(centers_[i]); + if (d < min_distance) { + best_center = i; + min_distance = d; } - sketches_.at(best_pos).solution(window_begin, centers_solution, cost_solution); + } + bool open_new = r->bernoulli( + std::min(1.0, std::pow(min_distance, 2) / denominator_prob_)); + if (open_new) { + create_center(point); + } else { + add_point_to_center(point, best_center, min_distance); + } } - // This class implements the well-known meyerson sketch for k-means with all the - // bookkeping needed for the sliding window algorithm. This class implements a - // single sketch. - class MeyersonSketch - { - public: - // max_num_centers is the maximum number of centers allowed in the sketch. - // denominator_prob is the denominator in the probability of selecting a point - // as center. - // epsilon_multiplicities is the epsilon factor used in the estimate of the - // multiplicites of the centers. k is the target number of centers. gen is a - // random source. 
- MeyersonSketch(Random *r, const double max_num_centers, const double denominator_prob, - const double epsilon_multiplicities, const int32_t k) - : r(r), - max_num_centers_(max_num_centers), - denominator_prob_(denominator_prob), - epsilon_multiplicities_(epsilon_multiplicities), - k_(k), - failed_sketch_(false) - {} - // Notice how the number of items stored is updated by the destructor. - virtual ~MeyersonSketch() {} - - // Copy operator. Handles the number of items stored. - MeyersonSketch(MeyersonSketch const &other) - { - r = other.r; - max_num_centers_ = other.max_num_centers_; - denominator_prob_ = other.denominator_prob_; - epsilon_multiplicities_ = other.epsilon_multiplicities_; - k_ = other.k_; - // gen_ = other.gen_; - failed_sketch_ = other.failed_sketch_; - centers_ = other.centers_; - // ITEMS_STORED += centers_.size(); - multiplicities_ = other.multiplicities_; - costs_sum_dist_ = other.costs_sum_dist_; - costs_sum_sq_dist_ = other.costs_sum_sq_dist_; - } + // Returns the estimate of the multiplicities of the centers assigned after + // a certain time. + void weighted_centers(const int after_time, std::vector *centers, + std::vector *weights) { + centers->clear(); + weights->clear(); + assert(multiplicities_.size() == centers_.size()); + centers->reserve(multiplicities_.size()); + weights->reserve(multiplicities_.size()); + for (int i = 0; i < multiplicities_.size(); i++) { + centers->push_back(centers_[i]); + weights->push_back(multiplicities_[i].total_after_time(after_time)); + } + } - MeyersonSketch &operator=(MeyersonSketch const &other) - { - r = other.r; - max_num_centers_ = other.max_num_centers_; - denominator_prob_ = other.denominator_prob_; - epsilon_multiplicities_ = other.epsilon_multiplicities_; - k_ = other.k_; - // gen_ = other.gen_; - failed_sketch_ = other.failed_sketch_; - // The previous number of centers is freed. 
- // ITEMS_STORED -= centers_.size(); - centers_ = other.centers_; - // ITEMS_STORED += centers_.size(); - multiplicities_ = other.multiplicities_; - costs_sum_dist_ = other.costs_sum_dist_; - costs_sum_sq_dist_ = other.costs_sum_sq_dist_; - return *this; + // Computes a solution of the state of the Meyerson Sketch after time + // "after_time". + bool solution(const int after_time, std::vector *centers, + double *cost) const { + centers->clear(); + *cost = 0.0; + + if (failed_sketch_) { + centers = nullptr; + *cost = std::numeric_limits::max(); + return false; + } + + if (after_time == 0 && precomputed_cost.has_value()) { + // No need for recomputation. + assert(precomputed_solution.has_value()); + *centers = precomputed_solution.value(); + *cost = precomputed_cost.value(); + return true; + } + + // Create weighted instance. + std::vector> instance; + assert(multiplicities_.size() == centers_.size()); + assert(multiplicities_.size() == costs_sum_dist_.size()); + assert(multiplicities_.size() == costs_sum_sq_dist_.size()); + instance.reserve(multiplicities_.size()); + for (int i = 0; i < multiplicities_.size(); i++) { + instance.push_back(std::make_pair( + centers_[i], multiplicities_[i].total_after_time(after_time))); + } + double cost_instance = 0; + + // Solve k-means. 
+ std::vector pos_centers; + k_means_plus_plus(r, instance, k_, &pos_centers, &cost_instance); + for (const auto &pos : pos_centers) { + assert(pos < instance.size()); + centers->push_back(instance.at(pos).first); + } + // Adding the cost of the sketch itset (sum of squared distances) + for (int i = 0; i < costs_sum_sq_dist_.size(); i++) { + cost_instance += costs_sum_sq_dist_.at(i).total_after_time(after_time); + } + // Adding 2*dist_center_to_sketch(sum dist to sketch center) + for (int i = 0; i < centers_.size(); i++) { + double min_distance = std::numeric_limits::max(); + for (const auto ¢er : *centers) { + min_distance = std::min(min_distance, centers_[i]->L2Dist(center)); } - - // Add a point - void add_point(const PointPtr &point) - { - if (failed_sketch_) - { - return; - } - if (centers_.empty()) - { - create_center(point); - return; - } - double min_distance = std::numeric_limits::max(); - int64_t best_center; - for (int i = 0; i < centers_.size(); i++) - { - double d = point->L2Dist(centers_[i]); - if (d < min_distance) - { - best_center = i; - min_distance = d; - } - } - bool open_new = - r->bernoulli(std::min(1.0, std::pow(min_distance, 2) / denominator_prob_)); - if (open_new) - { - create_center(point); - } - else - { - add_point_to_center(point, best_center, min_distance); - } + cost_instance += 2.0 * min_distance * + costs_sum_dist_.at(i).total_after_time(after_time); + } + *cost = cost_instance; + + // Update the precomputed solution. + if (after_time == 0) { + precomputed_solution.reset(); + precomputed_solution.emplace(*centers); + precomputed_cost = *cost; + last_precomputed_0_multiplicities_.clear(); + for (const auto &mult : multiplicities_) { + last_precomputed_0_multiplicities_.push_back(mult.count_at_0()); } - - // Returns the estimate of the multiplicities of the centers assigned after a - // certain time. 
- void weighted_centers(const int after_time, std::vector *centers, - std::vector *weights) - { - centers->clear(); - weights->clear(); - assert(multiplicities_.size() == centers_.size()); - centers->reserve(multiplicities_.size()); - weights->reserve(multiplicities_.size()); - for (int i = 0; i < multiplicities_.size(); i++) - { - centers->push_back(centers_[i]); - weights->push_back(multiplicities_[i].total_after_time(after_time)); - } + last_precomputed_0_costs_sum_dist_.clear(); + for (const auto &mult : costs_sum_dist_) { + last_precomputed_0_costs_sum_dist_.push_back(mult.count_at_0()); } - - // Computes a solution of the state of the Meyerson Sketch after time - // "after_time". - bool solution(const int after_time, std::vector *centers, double *cost) const - { - centers->clear(); - *cost = 0.0; - - if (failed_sketch_) - { - centers = nullptr; - *cost = std::numeric_limits::max(); - return false; - } - - if (after_time == 0 && precomputed_cost.has_value()) - { - // No need for recomputation. - assert(precomputed_solution.has_value()); - *centers = precomputed_solution.value(); - *cost = precomputed_cost.value(); - return true; - } - - // Create weighted instance. - std::vector> instance; - assert(multiplicities_.size() == centers_.size()); - assert(multiplicities_.size() == costs_sum_dist_.size()); - assert(multiplicities_.size() == costs_sum_sq_dist_.size()); - instance.reserve(multiplicities_.size()); - for (int i = 0; i < multiplicities_.size(); i++) - { - instance.push_back( - std::make_pair(centers_[i], multiplicities_[i].total_after_time(after_time))); - } - double cost_instance = 0; - - // Solve k-means. 
- std::vector pos_centers; - k_means_plus_plus(r, instance, k_, &pos_centers, &cost_instance); - for (const auto &pos : pos_centers) - { - assert(pos < instance.size()); - centers->push_back(instance.at(pos).first); - } - // Adding the cost of the sketch itset (sum of squared distances) - for (int i = 0; i < costs_sum_sq_dist_.size(); i++) - { - cost_instance += costs_sum_sq_dist_.at(i).total_after_time(after_time); - } - // Adding 2*dist_center_to_sketch(sum dist to sketch center) - for (int i = 0; i < centers_.size(); i++) - { - double min_distance = std::numeric_limits::max(); - for (const auto ¢er : *centers) - { - min_distance = std::min(min_distance, centers_[i]->L2Dist(center)); - } - cost_instance += - 2.0 * min_distance * costs_sum_dist_.at(i).total_after_time(after_time); - } - *cost = cost_instance; - - // Update the precomputed solution. - if (after_time == 0) - { - precomputed_solution.reset(); - precomputed_solution.emplace(*centers); - precomputed_cost = *cost; - last_precomputed_0_multiplicities_.clear(); - for (const auto &mult : multiplicities_) - { - last_precomputed_0_multiplicities_.push_back(mult.count_at_0()); - } - last_precomputed_0_costs_sum_dist_.clear(); - for (const auto &mult : costs_sum_dist_) - { - last_precomputed_0_costs_sum_dist_.push_back(mult.count_at_0()); - } - last_precomputed_0_costs_sum_sq_dist_.clear(); - for (const auto &mult : costs_sum_sq_dist_) - { - last_precomputed_0_costs_sum_sq_dist_.push_back(mult.count_at_0()); - } - } - - return true; + last_precomputed_0_costs_sum_sq_dist_.clear(); + for (const auto &mult : costs_sum_sq_dist_) { + last_precomputed_0_costs_sum_sq_dist_.push_back(mult.count_at_0()); } + } - // Compose this sketch as summary_B with summary_A to the left. 
- bool compose_left_solution(const MeyersonSketch &summary_A, int64_t window_begin, - std::vector *centers, double *cost) const - { - centers->clear(); - if (failed_sketch_ || summary_A.failed_sketch_) - { - centers = nullptr; - *cost = std::numeric_limits::max(); - return false; - } - - std::vector> instance; - - // Add all centers in B - assert(summary_A.multiplicities_.size() == summary_A.centers_.size()); - assert(summary_A.multiplicities_.size() == summary_A.costs_sum_dist_.size()); - assert(summary_A.multiplicities_.size() == summary_A.costs_sum_sq_dist_.size()); - instance.reserve(summary_A.multiplicities_.size() + multiplicities_.size()); - for (int i = 0; i < summary_A.multiplicities_.size(); i++) - { - instance.push_back( - std::make_pair(summary_A.centers_.at(i), - summary_A.multiplicities_.at(i).total_after_time(window_begin))); - } - - assert(multiplicities_.size() == centers_.size()); - assert(multiplicities_.size() == costs_sum_dist_.size()); - assert(multiplicities_.size() == costs_sum_sq_dist_.size()); - instance.reserve(multiplicities_.size()); - for (int i = 0; i < multiplicities_.size(); i++) - { - instance.push_back( - std::make_pair(centers_.at(i), multiplicities_.at(i).total_after_time(0))); - } - double cost_instance = 0; - - std::vector pos_centers; - k_means_plus_plus(r, instance, k_, &pos_centers, &cost_instance); - for (const auto &pos : pos_centers) - { - assert(pos < instance.size()); - centers->push_back(instance.at(pos).first); - } - // Adding the cost of the sketch itself - for (int i = 0; i < summary_A.costs_sum_sq_dist_.size(); i++) - { - cost_instance += summary_A.costs_sum_sq_dist_.at(i).total_after_time(window_begin); - } - for (int i = 0; i < costs_sum_sq_dist_.size(); i++) - { - cost_instance += costs_sum_sq_dist_.at(i).total_after_time(0); - } - // Adding the cost of 2*d(sum dist) - for (int i = 0; i < centers_.size(); i++) - { - double min_distance = std::numeric_limits::max(); - for (const auto ¢er : *centers) - { - 
min_distance = std::min(min_distance, centers_[i]->L2Dist(center)); - } - cost_instance += 2.0 * min_distance * costs_sum_dist_.at(i).total_after_time(0); - } - for (int i = 0; i < summary_A.centers_.size(); i++) - { - double min_distance = std::numeric_limits::max(); - for (const auto ¢er : *centers) - { - min_distance = std::min(min_distance, summary_A.centers_[i]->L2Dist(center)); - } - cost_instance += 2.0 * min_distance * - summary_A.costs_sum_dist_.at(i).total_after_time(window_begin); - } - *cost = cost_instance; - return true; - } + return true; + } - private: - // Initialize a new center - void create_center(const PointPtr &point) - { - if (failed_sketch_ || centers_.size() == max_num_centers_) - { - failed_sketch_ = true; - return; - } - // Invalidate precomputed solution. - precomputed_cost.reset(); - - // ++ITEMS_STORED; - centers_.push_back(point); - multiplicities_.push_back(ApproxTimeCountKeeper(epsilon_multiplicities_)); - multiplicities_.back().increase_total(point->index, 1); - costs_sum_dist_.push_back(ApproxTimeCountKeeper(epsilon_multiplicities_)); - costs_sum_dist_.back().increase_total(point->index, 0.0); - costs_sum_sq_dist_.push_back(ApproxTimeCountKeeper(epsilon_multiplicities_)); - costs_sum_sq_dist_.back().increase_total(point->index, 0.0); + // Compose this sketch as summary_B with summary_A to the left. 
+ bool compose_left_solution(const MeyersonSketch &summary_A, + int64_t window_begin, + std::vector *centers, + double *cost) const { + centers->clear(); + if (failed_sketch_ || summary_A.failed_sketch_) { + centers = nullptr; + *cost = std::numeric_limits::max(); + return false; + } + + std::vector> instance; + + // Add all centers in B + assert(summary_A.multiplicities_.size() == summary_A.centers_.size()); + assert(summary_A.multiplicities_.size() == + summary_A.costs_sum_dist_.size()); + assert(summary_A.multiplicities_.size() == + summary_A.costs_sum_sq_dist_.size()); + instance.reserve(summary_A.multiplicities_.size() + + multiplicities_.size()); + for (int i = 0; i < summary_A.multiplicities_.size(); i++) { + instance.push_back(std::make_pair( + summary_A.centers_.at(i), + summary_A.multiplicities_.at(i).total_after_time(window_begin))); + } + + assert(multiplicities_.size() == centers_.size()); + assert(multiplicities_.size() == costs_sum_dist_.size()); + assert(multiplicities_.size() == costs_sum_sq_dist_.size()); + instance.reserve(multiplicities_.size()); + for (int i = 0; i < multiplicities_.size(); i++) { + instance.push_back(std::make_pair( + centers_.at(i), multiplicities_.at(i).total_after_time(0))); + } + double cost_instance = 0; + + std::vector pos_centers; + k_means_plus_plus(r, instance, k_, &pos_centers, &cost_instance); + for (const auto &pos : pos_centers) { + assert(pos < instance.size()); + centers->push_back(instance.at(pos).first); + } + // Adding the cost of the sketch itself + for (int i = 0; i < summary_A.costs_sum_sq_dist_.size(); i++) { + cost_instance += + summary_A.costs_sum_sq_dist_.at(i).total_after_time(window_begin); + } + for (int i = 0; i < costs_sum_sq_dist_.size(); i++) { + cost_instance += costs_sum_sq_dist_.at(i).total_after_time(0); + } + // Adding the cost of 2*d(sum dist) + for (int i = 0; i < centers_.size(); i++) { + double min_distance = std::numeric_limits::max(); + for (const auto ¢er : *centers) { + min_distance 
= std::min(min_distance, centers_[i]->L2Dist(center)); } - - // Assign a point to a center. - void add_point_to_center(const PointPtr &point, const int32_t center_position, - const double distance) - { - multiplicities_[center_position].increase_total(point->index, 1); - costs_sum_dist_[center_position].increase_total(point->index, distance); - costs_sum_sq_dist_[center_position].increase_total(point->index, - ::std::pow(distance, 2)); - - if (precomputed_cost.has_value()) - { - if (multiplicities_.at(center_position).count_at_0() > - last_precomputed_0_multiplicities_.at(center_position) * - (1.0 + kErrorMarginPrecomputedSolution) || - costs_sum_dist_.at(center_position).count_at_0() > - last_precomputed_0_costs_sum_dist_.at(center_position) * - (1.0 + kErrorMarginPrecomputedSolution) || - costs_sum_sq_dist_.at(center_position).count_at_0() > - last_precomputed_0_costs_sum_sq_dist_.at(center_position) * - (1.0 + kErrorMarginPrecomputedSolution)) - { - // Invalidate precomputed solution. - precomputed_cost.reset(); - } - } + cost_instance += + 2.0 * min_distance * costs_sum_dist_.at(i).total_after_time(0); + } + for (int i = 0; i < summary_A.centers_.size(); i++) { + double min_distance = std::numeric_limits::max(); + for (const auto ¢er : *centers) { + min_distance = + std::min(min_distance, summary_A.centers_[i]->L2Dist(center)); } + cost_instance += + 2.0 * min_distance * + summary_A.costs_sum_dist_.at(i).total_after_time(window_begin); + } + *cost = cost_instance; + return true; + } - double max_num_centers_; - // This is the factor used in p = min(dist / denominator_prob_, 1) for - // inserting a point. - double denominator_prob_; - double epsilon_multiplicities_; - int32_t k_; - bool failed_sketch_; - vector centers_; - vector multiplicities_; - vector costs_sum_dist_; - vector costs_sum_sq_dist_; - Random *r; - - // Used to avoid recomputation if the solution has not significantly changed. 
- mutable std::optional precomputed_cost; - mutable std::optional> precomputed_solution; - mutable vector last_precomputed_0_multiplicities_; - mutable vector last_precomputed_0_costs_sum_dist_; - mutable vector last_precomputed_0_costs_sum_sq_dist_; - - // Margin of error in the precomputed solution. - static constexpr double kErrorMarginPrecomputedSolution = 0.05; - }; + private: + // Initialize a new center + void create_center(const PointPtr &point) { + if (failed_sketch_ || centers_.size() == max_num_centers_) { + failed_sketch_ = true; + return; + } + // Invalidate precomputed solution. + precomputed_cost.reset(); + + // ++ITEMS_STORED; + centers_.push_back(point); + multiplicities_.push_back(ApproxTimeCountKeeper(epsilon_multiplicities_)); + multiplicities_.back().increase_total(point->index, 1); + costs_sum_dist_.push_back(ApproxTimeCountKeeper(epsilon_multiplicities_)); + costs_sum_dist_.back().increase_total(point->index, 0.0); + costs_sum_sq_dist_.push_back( + ApproxTimeCountKeeper(epsilon_multiplicities_)); + costs_sum_sq_dist_.back().increase_total(point->index, 0.0); + } -protected: - void process_point_impl(PointPtr point) override - { - for (auto &sketch : sketches_) - { - sketch.add_point(point); + // Assign a point to a center. 
+ void add_point_to_center(const PointPtr &point, + const int32_t center_position, + const double distance) { + multiplicities_[center_position].increase_total(point->index, 1); + costs_sum_dist_[center_position].increase_total(point->index, distance); + costs_sum_sq_dist_[center_position].increase_total( + point->index, ::std::pow(distance, 2)); + + if (precomputed_cost.has_value()) { + if (multiplicities_.at(center_position).count_at_0() > + last_precomputed_0_multiplicities_.at(center_position) * + (1.0 + kErrorMarginPrecomputedSolution) || + costs_sum_dist_.at(center_position).count_at_0() > + last_precomputed_0_costs_sum_dist_.at(center_position) * + (1.0 + kErrorMarginPrecomputedSolution) || + costs_sum_sq_dist_.at(center_position).count_at_0() > + last_precomputed_0_costs_sum_sq_dist_.at(center_position) * + (1.0 + kErrorMarginPrecomputedSolution)) { + // Invalidate precomputed solution. + precomputed_cost.reset(); } + } } - void reset_impl() override - { - sketches_.clear(); - sketches_.reserve(kSketchNumber); - for (int i = 0; i < kSketchNumber; i++) - { - MeyersonSketch sketch(r, max_sketch_size_, distance_denominator_, kEpsilonMult, k_); - sketches_.push_back(sketch); - } + double max_num_centers_; + // This is the factor used in p = min(dist / denominator_prob_, 1) for + // inserting a point. + double denominator_prob_; + double epsilon_multiplicities_; + int32_t k_; + bool failed_sketch_; + vector centers_; + vector multiplicities_; + vector costs_sum_dist_; + vector costs_sum_sq_dist_; + Random *r; + + // Used to avoid recomputation if the solution has not significantly + // changed. + mutable std::optional precomputed_cost; + mutable std::optional> precomputed_solution; + mutable vector last_precomputed_0_multiplicities_; + mutable vector last_precomputed_0_costs_sum_dist_; + mutable vector last_precomputed_0_costs_sum_sq_dist_; + + // Margin of error in the precomputed solution. 
+ static constexpr double kErrorMarginPrecomputedSolution = 0.05; + }; + +protected: + void process_point_impl(PointPtr point) override { + for (auto &sketch : sketches_) { + sketch.add_point(point); } + } + + void reset_impl() override { + sketches_.clear(); + sketches_.reserve(kSketchNumber); + for (int i = 0; i < kSketchNumber; i++) { + MeyersonSketch sketch(r, max_sketch_size_, distance_denominator_, + kEpsilonMult, k_); + sketches_.push_back(sketch); + } + } private: - int32_t max_sketch_size_; - double distance_denominator_; - int32_t k_; + int32_t max_sketch_size_; + double distance_denominator_; + int32_t k_; - std::vector sketches_; + std::vector sketches_; - // Epsilon factor in the estimation of the multiplicities. - static constexpr double kEpsilonMult = 0.01; - // Number of sketches used. As stated in the paper, we use a single sketch as - // in practice is sufficient to get good results. - static constexpr int32_t kSketchNumber = 1; + // Epsilon factor in the estimation of the multiplicities. + static constexpr double kEpsilonMult = 0.01; + // Number of sketches used. As stated in the paper, we use a single sketch as + // in practice is sufficient to get good results. + static constexpr int32_t kSketchNumber = 1; }; // This class implements the procedure to update the pair of summaries // associated with a threshold lambda. -template -class OneThresholdsPairSummaryAlg -{ +template class OneThresholdsPairSummaryAlg { public: - // window is the window size. - // k is the number of clusters - // lambda is the threshold for the. - OneThresholdsPairSummaryAlg(Random *r, int64_t window, int32_t k, double lambda) - : r(r), - window_(window), - k_(k), - lambda_(lambda), - summary_A_(r, window_, k_, lambda_), - summary_B_(r, window_, k_, lambda_) - { - assert(lambda >= 0); + // window is the window size. + // k is the number of clusters + // lambda is the threshold for the. 
+ OneThresholdsPairSummaryAlg(Random *r, int64_t window, int32_t k, + double lambda) + : r(r), window_(window), k_(k), lambda_(lambda), + summary_A_(r, window_, k_, lambda_), + summary_B_(r, window_, k_, lambda_) { + assert(lambda >= 0); + } + + virtual ~OneThresholdsPairSummaryAlg() {} + + void process_point(PointPtr point) { + // Add the point to summary B. + SummaryAlg copy_B = summary_B_; + summary_B_.process_point(point); + vector unused_centers_solution; + double value; + summary_B_.solution(&unused_centers_solution, &value); + // If the value of summary B is < lambda then it is kept in B. Otherwise the + // new element is the start of a new summary B and summary A is assigned the + // old value of B. + if (value >= lambda_) { + summary_A_ = copy_B; + summary_B_.reset(); + summary_B_.process_point(point); + first_element_B_ = point; } + } - virtual ~OneThresholdsPairSummaryAlg() {} - - void process_point(PointPtr point) - { - // Add the point to summary B. - SummaryAlg copy_B = summary_B_; - summary_B_.process_point(point); - vector unused_centers_solution; - double value; - summary_B_.solution(&unused_centers_solution, &value); - // If the value of summary B is < lambda then it is kept in B. Otherwise the - // new element is the start of a new summary B and summary A is assigned the - // old value of B. 
- if (value >= lambda_) - { - summary_A_ = copy_B; - summary_B_.reset(); - summary_B_.process_point(point); - first_element_B_ = point; - } - } + const SummaryAlg *get_summary_A() const { return &summary_A_; } - const SummaryAlg *get_summary_A() const { return &summary_A_; } - - const SummaryAlg *get_summary_B() const { return &summary_B_; } + const SummaryAlg *get_summary_B() const { return &summary_B_; } private: - const int32_t window_; - const int32_t k_; - const double lambda_; - SummaryAlg summary_A_; - SummaryAlg summary_B_; - PointPtr first_element_B_; - Random *r; + const int32_t window_; + const int32_t k_; + const double lambda_; + SummaryAlg summary_A_; + SummaryAlg summary_B_; + PointPtr first_element_B_; + Random *r; }; // This runs the algorithmic framework for our sliding window algorithm. @@ -727,182 +667,154 @@ class OneThresholdsPairSummaryAlg // window which is the window size // begin grid which is used for the begin of the grid // end grid which is the end of the grid. -template -class FrameworkAlg -{ +template class FrameworkAlg { public: - FrameworkAlg(Random *r, int64_t window, int32_t k, double delta, double begin_grid, - double end_grid) - : r(r), - window_(window), - k_(k), - delta_(delta), - begin_grid_(begin_grid), - end_grid_(end_grid), - window_handler_(window) - { - assert(delta > 0); - assert(begin_grid < end_grid); - assert(window_ > 1); - - // Initializes the summary thresholds. - double lambda = begin_grid_; - while (lambda <= end_grid_ * (1.0 + delta)) - { - threshold_algs_.push_back( - OneThresholdsPairSummaryAlg(r, window_, k_, lambda)); - lambda *= (1 + delta); - } + FrameworkAlg(Random *r, int64_t window, int32_t k, double delta, + double begin_grid, double end_grid) + : r(r), window_(window), k_(k), delta_(delta), begin_grid_(begin_grid), + end_grid_(end_grid), window_handler_(window) { + assert(delta > 0); + assert(begin_grid < end_grid); + assert(window_ > 1); + + // Initializes the summary thresholds. 
+ double lambda = begin_grid_; + while (lambda <= end_grid_ * (1.0 + delta)) { + threshold_algs_.push_back( + OneThresholdsPairSummaryAlg(r, window_, k_, lambda)); + lambda *= (1 + delta); } - // Disllow copy and assign. - FrameworkAlg(FrameworkAlg const &) = delete; - void operator=(FrameworkAlg const &x) = delete; - - // Processes a point. - void process_point(PointPtr point) - { - window_handler_.next(); - for (auto &threshold_alg : threshold_algs_) - { - threshold_alg.process_point(point); - } + } + // Disllow copy and assign. + FrameworkAlg(FrameworkAlg const &) = delete; + void operator=(FrameworkAlg const &x) = delete; + + // Processes a point. + void process_point(PointPtr point) { + window_handler_.next(); + for (auto &threshold_alg : threshold_algs_) { + threshold_alg.process_point(point); } - - // Outputs the solution and its cost. - void solution(vector *solution, double *value) - { - // Contrary to the simplified version in the main body of the paper, the - // algorithm considers any valid solution (i.e. computer over the entire - // window) and outputs the one with the lowest estimate of cost, this only - // improves the results. - // Check if there is B_lambda that is = active_window. 
- double min_cost = std::numeric_limits::max(); - vector best_solution; - - for (const auto &threshold_alg : threshold_algs_) - { - if (!threshold_alg.get_summary_B()->is_empty() && - threshold_alg.get_summary_B()->first_element_time() == - window_handler_.begin_window()) - { - threshold_alg.get_summary_B()->solution(solution, value); - if (*value < min_cost) - { - min_cost = *value; - best_solution = *solution; - } - } - else if (!threshold_alg.get_summary_B()->is_empty() && - threshold_alg.get_summary_B()->first_element_time() < - window_handler_.begin_window()) - { - threshold_alg.get_summary_B()->solution_right_composition_with_empty( - window_handler_.begin_window(), solution, value); - if (*value < min_cost) - { - min_cost = *value; - best_solution = *solution; - } - } + } + + // Outputs the solution and its cost. + void solution(vector *solution, double *value) { + // Contrary to the simplified version in the main body of the paper, the + // algorithm considers any valid solution (i.e. computer over the entire + // window) and outputs the one with the lowest estimate of cost, this only + // improves the results. + // Check if there is B_lambda that is = active_window. 
+ double min_cost = std::numeric_limits::max(); + vector best_solution; + + for (const auto &threshold_alg : threshold_algs_) { + if (!threshold_alg.get_summary_B()->is_empty() && + threshold_alg.get_summary_B()->first_element_time() == + window_handler_.begin_window()) { + threshold_alg.get_summary_B()->solution(solution, value); + if (*value < min_cost) { + min_cost = *value; + best_solution = *solution; + } + } else if (!threshold_alg.get_summary_B()->is_empty() && + threshold_alg.get_summary_B()->first_element_time() < + window_handler_.begin_window()) { + threshold_alg.get_summary_B()->solution_right_composition_with_empty( + window_handler_.begin_window(), solution, value); + if (*value < min_cost) { + min_cost = *value; + best_solution = *solution; } + } + } - // This is a backup in case the algorithmic guarantess of the summaries - // fails (or if the lower or upper bound are wrong) which is possible in - // practice. It will output the solution of the sketch (subset of the window - // that has longest possible history). - vector oldest_sketch_solution; - double oldest_sketch_cost; - int64_t oldest_begin_first = std::numeric_limits::max(); - - for (const auto &threshold_alg : threshold_algs_) - { - // A is not empty and it is not a subset of W for the smallest lambda. - if (!threshold_alg.get_summary_A()->is_empty() && - threshold_alg.get_summary_A()->first_element_time() < - window_handler_.begin_window()) - { - // Either W is a strict subset of B. 
- if (threshold_alg.get_summary_B()->first_element_time() < - window_handler_.begin_window()) - { - threshold_alg.get_summary_B()->solution_right_composition_with_empty( - window_handler_.begin_window(), solution, value); - if (*value < min_cost) - { - min_cost = *value; - best_solution = *solution; - } - } - else - { - threshold_alg.get_summary_B()->solution_left_composition( - *threshold_alg.get_summary_A(), window_handler_.begin_window(), solution, - value); - if (*value < min_cost) - { - min_cost = *value; - best_solution = *solution; - } - } - } - else - { // backup for the case with no guarantees. - if (!threshold_alg.get_summary_A()->is_empty() && - threshold_alg.get_summary_A()->first_element_time() < oldest_begin_first) - { - oldest_begin_first = threshold_alg.get_summary_A()->first_element_time(); - threshold_alg.get_summary_B()->solution_left_composition( - *threshold_alg.get_summary_A(), 0, &oldest_sketch_solution, - &oldest_sketch_cost); - } - } + // This is a backup in case the algorithmic guarantess of the summaries + // fails (or if the lower or upper bound are wrong) which is possible in + // practice. It will output the solution of the sketch (subset of the window + // that has longest possible history). + vector oldest_sketch_solution; + double oldest_sketch_cost; + int64_t oldest_begin_first = std::numeric_limits::max(); + + for (const auto &threshold_alg : threshold_algs_) { + // A is not empty and it is not a subset of W for the smallest lambda. + if (!threshold_alg.get_summary_A()->is_empty() && + threshold_alg.get_summary_A()->first_element_time() < + window_handler_.begin_window()) { + // Either W is a strict subset of B. 
+ if (threshold_alg.get_summary_B()->first_element_time() < + window_handler_.begin_window()) { + threshold_alg.get_summary_B()->solution_right_composition_with_empty( + window_handler_.begin_window(), solution, value); + if (*value < min_cost) { + min_cost = *value; + best_solution = *solution; + } + } else { + threshold_alg.get_summary_B()->solution_left_composition( + *threshold_alg.get_summary_A(), window_handler_.begin_window(), + solution, value); + if (*value < min_cost) { + min_cost = *value; + best_solution = *solution; + } } - *solution = best_solution; - *value = min_cost; - if (min_cost < std::numeric_limits::max()) - { - return; + } else { // backup for the case with no guarantees. + if (!threshold_alg.get_summary_A()->is_empty() && + threshold_alg.get_summary_A()->first_element_time() < + oldest_begin_first) { + oldest_begin_first = + threshold_alg.get_summary_A()->first_element_time(); + threshold_alg.get_summary_B()->solution_left_composition( + *threshold_alg.get_summary_A(), 0, &oldest_sketch_solution, + &oldest_sketch_cost); } - // This point is reached in case of failure of the sketches to give approx. - // guarantees or if the upper-lower bounds of the cost are wrong. In this - // case - *solution = oldest_sketch_solution; - *value = oldest_sketch_cost; - assert(oldest_begin_first < std::numeric_limits::max()); + } + } + *solution = best_solution; + *value = min_cost; + if (min_cost < std::numeric_limits::max()) { + return; } + // This point is reached in case of failure of the sketches to give approx. + // guarantees or if the upper-lower bounds of the cost are wrong. 
In this + // case + *solution = oldest_sketch_solution; + *value = oldest_sketch_cost; + assert(oldest_begin_first < std::numeric_limits::max()); + } private: - Random *r; - const int64_t window_; - const int32_t k_; - const double delta_; - const double begin_grid_; - const double end_grid_; - SlidingWindowHandler window_handler_; - std::vector> threshold_algs_; + Random *r; + const int64_t window_; + const int32_t k_; + const double delta_; + const double begin_grid_; + const double end_grid_; + SlidingWindowHandler window_handler_; + std::vector> threshold_algs_; }; -class SlidingWindowClustering : public Algorithm -{ +class SlidingWindowClustering : public Algorithm { private: - Random r; - std::vector samples; - bool has_sampled = false; - std::shared_ptr> framework; - int count = 0; + Random r; + std::vector samples; + bool has_sampled = false; + std::shared_ptr> framework; + int count = 0; public: - SlidingWindowClustering(param_t &cmd_params); + SlidingWindowClustering(param_t &cmd_params); - ~SlidingWindowClustering(); + ~SlidingWindowClustering(); - void Init() override; + void Init() override; - void RunOnline(PointPtr input) override; + void RunOnline(PointPtr input) override; - void RunOffline(DataSinkPtr sinkPtr) override; + void RunOffline(DataSinkPtr sinkPtr) override; }; -} // namespace SESAME +} // namespace SESAME #endif diff --git a/include/Algorithm/StreamKM.hpp b/include/Algorithm/StreamKM.hpp index 6614b8da..dab896b6 100644 --- a/include/Algorithm/StreamKM.hpp +++ b/include/Algorithm/StreamKM.hpp @@ -1,4 +1,5 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) // // Created by tuidan on 2021/7/21. 
@@ -13,41 +14,39 @@ #include #include -namespace SESAME -{ +namespace SESAME { -class StreamKMParameter : public SesameParam -{ +class StreamKMParameter : public SesameParam { public: - int windowSize; - int seed; - int num_clusters; + int windowSize; + int seed; + int num_clusters; }; -class StreamKM : public Algorithm -{ +class StreamKM : public Algorithm { public: - StreamKMParameter StreamKMParam; + StreamKMParameter StreamKMParam; - // initialize - LandmarkWindowPtr window; - vector inputs; // buffered inputs. - vector streamingCoreset; // intermediate results. - KMeans km; // used for offline processing. - StreamKM(param_t &cmd_params); + // initialize + LandmarkWindowPtr window; + vector inputs; // buffered inputs. + vector streamingCoreset; // intermediate results. + KMeans km; // used for offline processing. + StreamKM(param_t &cmd_params); - ~StreamKM(); + ~StreamKM(); - void Init() override; + void Init() override; - void RunOnline(PointPtr input) override; + void RunOnline(PointPtr input) override; - void RunOffline(DataSinkPtr sinkPtr) override; + void RunOffline(DataSinkPtr sinkPtr) override; private: - void dumpResults(vector ¢ers, vector> groups, - DataSinkPtr ptr) const; + void dumpResults(vector ¢ers, + vector> groups, + DataSinkPtr ptr) const; }; -} // namespace SESAME +} // namespace SESAME -#endif // SESAME_SRC_ALGORITHM_STREAMKM_HPP_ +#endif // SESAME_SRC_ALGORITHM_STREAMKM_HPP_ diff --git a/include/Algorithm/WindowModel/DampedWindow.hpp b/include/Algorithm/WindowModel/DampedWindow.hpp index c4944aaa..f6215ce0 100644 --- a/include/Algorithm/WindowModel/DampedWindow.hpp +++ b/include/Algorithm/WindowModel/DampedWindow.hpp @@ -14,20 +14,18 @@ #include #include #include -namespace SESAME -{ +namespace SESAME { class DampedWindow; typedef std::shared_ptr DampedWindowPtr; -class DampedWindow : WindowModel -{ +class DampedWindow : WindowModel { public: - double base; - double lambda; - DampedWindow(double base, double lambda); - double 
decayFunction(timespec startTime, timespec currentTimestamp) const; - double decayFunction(int startTime, int currentTimestamp) const; + double base; + double lambda; + DampedWindow(double base, double lambda); + double decayFunction(timespec startTime, timespec currentTimestamp) const; + double decayFunction(int startTime, int currentTimestamp) const; }; -} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_WINDOWMODEL_DAMPEDWINDOW_HPP_ +} // namespace SESAME +#endif // SESAME_INCLUDE_ALGORITHM_WINDOWMODEL_DAMPEDWINDOW_HPP_ diff --git a/include/Algorithm/WindowModel/LandmarkWindow.hpp b/include/Algorithm/WindowModel/LandmarkWindow.hpp index 99d56939..b2a23aee 100644 --- a/include/Algorithm/WindowModel/LandmarkWindow.hpp +++ b/include/Algorithm/WindowModel/LandmarkWindow.hpp @@ -21,165 +21,163 @@ #include #include -namespace SESAME -{ +namespace SESAME { class LandmarkWindow; typedef std::shared_ptr LandmarkWindowPtr; -class LandmarkWindow : WindowModel -{ +class LandmarkWindow : WindowModel { public: - class CoresetTree; - typedef std::shared_ptr CoresetTreePtr; - - class CoresetTree - { - private: - param_t param; - - public: - CoresetTree(const param_t ¶m) : param(param) {} - /** - * initalizes root as a treenode with the union of setA and setB as pointset and centre as - * centre - * @param root - * @param setA - * @param setB - * @param n_1 - * @param n_2 - * @param centre - * @param centreIndex - */ - void constructRoot(TreeNodePtr root, std::vector &setA, - std::vector &setB, int n_1, int n_2, PointPtr centre, - int centreIndex); - - /** - Constructs a coreset of size k from the union of setA and setB - **/ - void unionTreeCoreset(int k, int n_1, int n_2, std::vector &setA, - std::vector &setB, std::vector ¢res); - void freeTree(TreeNodePtr root); - bool treeFinished(TreeNodePtr root); - bool isLeaf(TreeNodePtr node); - - /** - * Computes the target function value of the n points of the treenode. 
Differs from the - * function "targetFunctionValue" in three things: - * 1. only the centre of the treenode is used as a centre - * 2. works on arrays of pointers instead on arrays of points - * 3. stores the cost in the treenode - * @param node - */ - void treeNodeTargetFunctionValue(TreeNodePtr node); - - /** - selects a leaf node (using the kMeans++ distribution) - **/ - TreeNodePtr selectNode(TreeNodePtr root); - PointPtr chooseCentre(TreeNodePtr node); - double treeNodeCostOfPoint(TreeNodePtr node, PointPtr p); - double treeNodeSplitCost(TreeNodePtr node, PointPtr CenterA, PointPtr CenterB); - void split(TreeNodePtr parent, PointPtr newCentre, int newCentreIndex); - PointPtr determineClosestCentre(PointPtr point, PointPtr centreA, PointPtr centreB); - }; - /** - DataStructure representing a single window - **/ - struct Window - { - int cursize; - std::vector points; - std::vector spillover; - }; + class CoresetTree; + typedef std::shared_ptr CoresetTreePtr; + class CoresetTree { + private: + param_t param; + + public: + CoresetTree(const param_t ¶m) : param(param) {} /** - datastructure for managing all O(log(n)) windows - **/ - struct WindowManager - { - int numberOfWindow; - int maxWindowSize; - std::vector windows; // struct Window *windows; - }; - - WindowManager windowManager; - TimeMeter timerMeter; - CoresetTreePtr tree; - /** - * initialize windows in the window manager. 
- * @param dim - * @param coreset_size + * initalizes root as a treenode with the union of setA and setB as pointset + * and centre as centre + * @param root + * @param setA + * @param setB + * @param n_1 + * @param n_2 + * @param centre + * @param centreIndex */ - void initWindow(int num); + void constructRoot(TreeNodePtr root, std::vector &setA, + std::vector &setB, int n_1, int n_2, + PointPtr centre, int centreIndex); /** - inserts a single point into the bucketmanager + Constructs a coreset of size k from the union of setA and setB **/ - void insertPoint(PointPtr point); + void unionTreeCoreset(int k, int n_1, int n_2, std::vector &setA, + std::vector &setB, + std::vector ¢res); + void freeTree(TreeNodePtr root); + bool treeFinished(TreeNodePtr root); + bool isLeaf(TreeNodePtr node); /** - It may happen that the manager is not full (since n is not always a power of - 2). In this case we extract the coreset from the manager by computing a - coreset of all nonempty windows - - Case 1: the last bucket is full - => n is a power of 2 and we return the contents of the last bucket - - Case2: the last bucket is not full - => we compute a coreset of all nonempty windows - - this operation should only be called after the streaming process is sourceEnd - **/ - std::vector getCoresetFromManager( - std::vector & - coreset); // https://stackoverflow.com/questions/15704565/efficient-way-to-return-a-stdvector-in-c + * Computes the target function value of the n points of the treenode. + * Differs from the function "targetFunctionValue" in three things: + * 1. only the centre of the treenode is used as a centre + * 2. works on arrays of pointers instead on arrays of points + * 3. 
stores the cost in the treenode + * @param node + */ + void treeNodeTargetFunctionValue(TreeNodePtr node); /** - Pyramidal time frame: Taking snapshots of online Micro clusters in Clustream - algorithms + selects a leaf node (using the kMeans++ distribution) **/ - - /** - Data Structure representing all orders of snapshots - **/ - SESAME::QueueOrderSnapshot orderSnapShots; - struct PyramidalWindow - { - unsigned int time_interval; // time interval of pyramidal window - unsigned int currentOrder; // the biggest order T of snapshots - }; - PyramidalWindow pyramidalWindow; - - /** - * @Description: init the pyramidal window, pass the value of startTime and - * reset shared_ptr Arrayqueue of snapshots - * @Param: startTime: the start time point of algorithm - * time_interval: the time interval of online pyramidal window frame - * @Return: void - */ - void initPyramidalWindow(unsigned int time_interval); - /** - * @Description: this function is pyramidal window for Clustream, - * which takes and stores the snapshots of micro clusters - * @Param: microClusters: micro clusters' snapshot need to be stored - * elapsedTime: the current elapsed time - * @Return: void - */ - void pyramidalWindowProcess(int elapsedTime, const MicroClusters µClusters); - /** - * @Description: this function stores snapshots into the pyramidal window data - * structure - * @Param: currentOrder: the ith order snapshots stored into - * microClusters: micro clusters' snapshot need to be stored - * elapsedTime: the current elapsed time - * @Return: void - */ - void storeSnapshot(unsigned int currentOrder, const MicroClusters µClusters, - int elapsedTime); - void clearPyramidalWindow(); + TreeNodePtr selectNode(TreeNodePtr root); + PointPtr chooseCentre(TreeNodePtr node); + double treeNodeCostOfPoint(TreeNodePtr node, PointPtr p); + double treeNodeSplitCost(TreeNodePtr node, PointPtr CenterA, + PointPtr CenterB); + void split(TreeNodePtr parent, PointPtr newCentre, int newCentreIndex); + PointPtr 
determineClosestCentre(PointPtr point, PointPtr centreA, + PointPtr centreB); + }; + /** + DataStructure representing a single window + **/ + struct Window { + int cursize; + std::vector points; + std::vector spillover; + }; + + /** + datastructure for managing all O(log(n)) windows + **/ + struct WindowManager { + int numberOfWindow; + int maxWindowSize; + std::vector windows; // struct Window *windows; + }; + + WindowManager windowManager; + TimeMeter timerMeter; + CoresetTreePtr tree; + /** + * initialize windows in the window manager. + * @param dim + * @param coreset_size + */ + void initWindow(int num); + + /** + inserts a single point into the bucketmanager + **/ + void insertPoint(PointPtr point); + + /** + It may happen that the manager is not full (since n is not always a power of + 2). In this case we extract the coreset from the manager by computing a + coreset of all nonempty windows + + Case 1: the last bucket is full + => n is a power of 2 and we return the contents of the last bucket + + Case2: the last bucket is not full + => we compute a coreset of all nonempty windows + + this operation should only be called after the streaming process is sourceEnd + **/ + std::vector getCoresetFromManager( + std::vector & + coreset); // https://stackoverflow.com/questions/15704565/efficient-way-to-return-a-stdvector-in-c + + /** + Pyramidal time frame: Taking snapshots of online Micro clusters in Clustream + algorithms + **/ + + /** + Data Structure representing all orders of snapshots + **/ + SESAME::QueueOrderSnapshot orderSnapShots; + struct PyramidalWindow { + unsigned int time_interval; // time interval of pyramidal window + unsigned int currentOrder; // the biggest order T of snapshots + }; + PyramidalWindow pyramidalWindow; + + /** + * @Description: init the pyramidal window, pass the value of startTime and + * reset shared_ptr Arrayqueue of snapshots + * @Param: startTime: the start time point of algorithm + * time_interval: the time interval of online 
pyramidal window frame + * @Return: void + */ + void initPyramidalWindow(unsigned int time_interval); + /** + * @Description: this function is pyramidal window for Clustream, + * which takes and stores the snapshots of micro clusters + * @Param: microClusters: micro clusters' snapshot need to be stored + * elapsedTime: the current elapsed time + * @Return: void + */ + void pyramidalWindowProcess(int elapsedTime, + const MicroClusters µClusters); + /** + * @Description: this function stores snapshots into the pyramidal window data + * structure + * @Param: currentOrder: the ith order snapshots stored into + * microClusters: micro clusters' snapshot need to be stored + * elapsedTime: the current elapsed time + * @Return: void + */ + void storeSnapshot(unsigned int currentOrder, + const MicroClusters µClusters, int elapsedTime); + void clearPyramidalWindow(); }; -} // namespace SESAME +} // namespace SESAME -#endif // SESAME_SRC_ALGORITHM_WINDOWMODEL_LANDMARKWINDOW_HPP_ +#endif // SESAME_SRC_ALGORITHM_WINDOWMODEL_LANDMARKWINDOW_HPP_ diff --git a/include/Algorithm/WindowModel/WindowFactory.hpp b/include/Algorithm/WindowModel/WindowFactory.hpp index 08d278ff..4ee26d3b 100644 --- a/include/Algorithm/WindowModel/WindowFactory.hpp +++ b/include/Algorithm/WindowModel/WindowFactory.hpp @@ -1,4 +1,5 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) // // Created by Shuhao Zhang on 27/07/2021. 
@@ -10,13 +11,12 @@ #include #include #include -namespace SESAME -{ -class WindowFactory -{ +namespace SESAME { +class WindowFactory { public: - static std::shared_ptr createLandmarkWindow(); - static std::shared_ptr createDampedWindow(double base, double lambda); + static std::shared_ptr createLandmarkWindow(); + static std::shared_ptr createDampedWindow(double base, + double lambda); }; -} // namespace SESAME -#endif // SESAME_INCLUDE_ALGORITHM_WINDOWMODEL_WINDOWFACTORY_HPP_ +} // namespace SESAME +#endif // SESAME_INCLUDE_ALGORITHM_WINDOWMODEL_WINDOWFACTORY_HPP_ diff --git a/include/Algorithm/WindowModel/WindowModel.hpp b/include/Algorithm/WindowModel/WindowModel.hpp index 044b4a80..95c8a38f 100644 --- a/include/Algorithm/WindowModel/WindowModel.hpp +++ b/include/Algorithm/WindowModel/WindowModel.hpp @@ -16,81 +16,69 @@ #include #include -namespace SESAME -{ +namespace SESAME { class WindowModel; typedef std::shared_ptr WindowPtr; -class WindowModel -{}; +class WindowModel {}; -class Landmark : WindowModel -{ +class Landmark : WindowModel { private: - int landmark_; + int landmark_; public: - Landmark(const SesameParam ¶m) : landmark_(param.landmark) {} - bool Add(PointPtr input) { return input->index == 0 || input->index % landmark_ != 0; } + Landmark(const SesameParam ¶m) : landmark_(param.landmark) {} + bool Add(PointPtr input) { + return input->index == 0 || input->index % landmark_ != 0; + } }; -class Sliding : WindowModel -{ +class Sliding : WindowModel { private: - int sliding_; - std::queue queue_; + int sliding_; + std::queue queue_; public: - Sliding(const SesameParam ¶m) : sliding_(param.sliding) {} - bool Add(const PointPtr input) - { - queue_.push(input); - return true; - } - PointPtr Delete() - { - if (queue_.size() > sliding_) - { - auto ret = queue_.front(); - queue_.pop(); - return ret; - } - return nullptr; + Sliding(const SesameParam ¶m) : sliding_(param.sliding) {} + bool Add(const PointPtr input) { + queue_.push(input); + return true; + } + 
PointPtr Delete() { + if (queue_.size() > sliding_) { + auto ret = queue_.front(); + queue_.pop(); + return ret; } + return nullptr; + } }; -class Damped : WindowModel -{ +class Damped : WindowModel { private: - const double alpha_, lambda_; - const int buf_size_; - int cnt_ = 0; + const double alpha_, lambda_; + const int buf_size_; + int cnt_ = 0; public: - Damped(const SesameParam ¶m) - : alpha_(param.alpha), lambda_(param.lambda), buf_size_(param.buf_size) - {} - bool Add(const PointPtr input) - { - ++cnt_; - return true; - } - bool Update() - { - if (cnt_ >= buf_size_) - { - cnt_ = 0; - return true; - } - return false; - } - template - void Update(T node) - { - node->Scale(pow(alpha_, -lambda_)); + Damped(const SesameParam ¶m) + : alpha_(param.alpha), lambda_(param.lambda), buf_size_(param.buf_size) {} + bool Add(const PointPtr input) { + ++cnt_; + return true; + } + bool Update() { + if (cnt_ >= buf_size_) { + cnt_ = 0; + return true; } + return false; + } + template void Update(T node) { + node->Scale(pow(alpha_, -lambda_)); + } }; -} // namespace SESAME +} // namespace SESAME -#endif // SESAME_SRC_ALGORITHM_WINDOWMODEL_WINDOW_HPP_ +#endif // SESAME_SRC_ALGORITHM_WINDOWMODEL_WINDOW_HPP_ diff --git a/include/Engine/Engine.hpp b/include/Engine/Engine.hpp index aa62d312..4fec3c5f 100644 --- a/include/Engine/Engine.hpp +++ b/include/Engine/Engine.hpp @@ -1,4 +1,5 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) // // Created by Shuhao Zhang on 19/07/2021. 
@@ -7,9 +8,7 @@ #ifndef SESAME_INCLUDE_ENGINE_ENGINE_HPP_ #define SESAME_INCLUDE_ENGINE_ENGINE_HPP_ -namespace SESAME -{ -class Engine -{}; -} // namespace SESAME -#endif // SESAME_INCLUDE_ENGINE_ENGINE_HPP_ +namespace SESAME { +class Engine {}; +} // namespace SESAME +#endif // SESAME_INCLUDE_ENGINE_ENGINE_HPP_ diff --git a/include/Engine/SimpleEngine.hpp b/include/Engine/SimpleEngine.hpp index e6e4ddd5..22e30020 100644 --- a/include/Engine/SimpleEngine.hpp +++ b/include/Engine/SimpleEngine.hpp @@ -1,4 +1,5 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) // // Created by Shuhao Zhang on 19/07/2021. @@ -13,34 +14,35 @@ using namespace std; -namespace SESAME -{ +namespace SESAME { /** * The SingleThreadEngine will spawn one thread for source, sink and algorithm. * TODO: allow to pass in multiple data sources and multiple data sinks. */ -class SimpleEngine : SESAME::Engine -{ +class SimpleEngine : SESAME::Engine { private: - DataSourcePtr sourcePtr; - DataSinkPtr sinkPtr; - AlgorithmPtr algoPtr; - SingleThreadPtr threadPtr; // SimpleEngine has only one thread to run algorithm. - atomic_int threadID; - TimeMeter overallMeter; + DataSourcePtr sourcePtr; + DataSinkPtr sinkPtr; + AlgorithmPtr algoPtr; + SingleThreadPtr + threadPtr; // SimpleEngine has only one thread to run algorithm. + atomic_int threadID; + TimeMeter overallMeter; public: - BarrierPtr barrierPtr; - SimpleEngine(DataSourcePtr sourcePtr, DataSinkPtr sinkPtr, AlgorithmPtr algoPtr); - // void createBarrier(); - void run(); // start the engine. - void runningRoutine(DataSourcePtr sourcePtr, DataSinkPtr sinkPtr, AlgorithmPtr algoPtr); - bool start(DataSourcePtr sourcePtr, DataSinkPtr sinkPtr, AlgorithmPtr algoPtr, - int id); // start the algorithm thread. 
- bool stop(); - int assignID(); - void printTime(); + BarrierPtr barrierPtr; + SimpleEngine(DataSourcePtr sourcePtr, DataSinkPtr sinkPtr, + AlgorithmPtr algoPtr); + // void createBarrier(); + void run(); // start the engine. + void runningRoutine(DataSourcePtr sourcePtr, DataSinkPtr sinkPtr, + AlgorithmPtr algoPtr); + bool start(DataSourcePtr sourcePtr, DataSinkPtr sinkPtr, AlgorithmPtr algoPtr, + int id); // start the algorithm thread. + bool stop(); + int assignID(); + void printTime(); }; -} // namespace SESAME -#endif // SESAME_INCLUDE_Engine_SINGLETHREADENGINE_H_ +} // namespace SESAME +#endif // SESAME_INCLUDE_Engine_SINGLETHREADENGINE_H_ diff --git a/include/Engine/SingleThread.hpp b/include/Engine/SingleThread.hpp index 17becd54..4a4ceec5 100644 --- a/include/Engine/SingleThread.hpp +++ b/include/Engine/SingleThread.hpp @@ -1,4 +1,5 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) // // Created by Shuhao Zhang on 20/8/21. 
@@ -10,23 +11,21 @@ #include #include -namespace SESAME -{ +namespace SESAME { class SingleThread; typedef std::shared_ptr SingleThreadPtr; -class SingleThread -{ +class SingleThread { private: - std::shared_ptr ThreadPtr; - int id; + std::shared_ptr ThreadPtr; + int id; public: - void construct(std::function fun, int id); - void join(); - int getID(); - int setID(int id); + void construct(std::function fun, int id); + void join(); + int getID(); + int setID(int id); }; -} // namespace SESAME +} // namespace SESAME -#endif // SESAME_SRC_ENGINE_THREAD_HPP_ +#endif // SESAME_SRC_ENGINE_THREAD_HPP_ diff --git a/include/Evaluation/CMM.hpp b/include/Evaluation/CMM.hpp index 3b0a33b2..b0c898b9 100644 --- a/include/Evaluation/CMM.hpp +++ b/include/Evaluation/CMM.hpp @@ -13,19 +13,10 @@ #include #include -namespace SESAME -{ +namespace SESAME { -enum ClusterType -{ - Cluster, - GTCluster -}; -enum ClusterMapType -{ - vote, - similarity -}; +enum ClusterType { Cluster, GTCluster }; +enum ClusterMapType { vote, similarity }; class CMMPoint; class CMMCluster; @@ -34,152 +25,144 @@ class CMM; typedef std::shared_ptr CMMPointPtr; typedef std::shared_ptr CMMClusterPtr; -class CMMPoint -{ +class CMMPoint { public: - int id; - long startTime; - int dim; - std::vector vec; - double weight; - int truth; - double conCL = 0; - double con = 0; - - CMMPoint(PointPtr p) - : id(p->index), - startTime(p->timestamp), - dim(p->dim), - vec(p->feature), - weight(p->weight), - truth(p->clu_id) - {} - - CMMPoint(int id, long startTime, long time, std::vector &vec, double a, double lambda, - int truth); - - double getDisTo(CMMPointPtr &p); - - double knnDis(int k, CMMCluster &c); + int id; + long startTime; + int dim; + std::vector vec; + double weight; + int truth; + double conCL = 0; + double con = 0; + + CMMPoint(PointPtr p) + : id(p->index), startTime(p->timestamp), dim(p->dim), vec(p->feature), + weight(p->weight), truth(p->clu_id) {} + + CMMPoint(int id, long startTime, long time, 
std::vector &vec, + double a, double lambda, int truth); + + double getDisTo(CMMPointPtr &p); + + double knnDis(int k, CMMCluster &c); }; -class CMMCluster -{ +class CMMCluster { public: - std::vector points; - int groundTruth; - std::vector rho; - double knhDis{}; - ClusterType type; - CMMCluster(); + std::vector points; + int groundTruth; + std::vector rho; + double knhDis{}; + ClusterType type; + CMMCluster(); - void add(CMMPointPtr &p); + void add(CMMPointPtr &p); - // void getDistribution(std::unordered_map &map); + // void getDistribution(std::unordered_map &map); - void getConn(); + void getConn(); }; -class CMMDriver -{ +class CMMDriver { public: - double a; - double lambda; - int dim; - int groundTruth; - - std::vector points; - std::unordered_map CL; - std::vector CLlist; - std::unordered_map C; - std::unordered_map CToCL; - std::vector Clist; - std::vector faultSet; - std::unordered_set faultClu; - - CMMDriver(int dim, double a, double lambda); - void load(const std::vector &inputs, const std::vector &predicts, int dim, - double weight); - void voteMap(); - double computeWeight(double deltaTime); - // int getDelta(CMMClusterPtr ci, CMMClusterPtr cljo); - void getFaultSet(); - double compCMM(); - void compCon(); + double a; + double lambda; + int dim; + int groundTruth; + + std::vector points; + std::unordered_map CL; + std::vector CLlist; + std::unordered_map C; + std::unordered_map CToCL; + std::vector Clist; + std::vector faultSet; + std::unordered_set faultClu; + + CMMDriver(int dim, double a, double lambda); + void load(const std::vector &inputs, + const std::vector &predicts, int dim, double weight); + void voteMap(); + double computeWeight(double deltaTime); + // int getDelta(CMMClusterPtr ci, CMMClusterPtr cljo); + void getFaultSet(); + double compCMM(); + void compCon(); }; -class CMM -{ +class CMM { private: - param_t param; - /** - * number of non-noise points that will create an error due to the underlying - * clustering model (e.g. 
point being covered by two clusters representing - * different classes) - */ - int noiseErrorByModel = 0; - /** - * number of noise points that will create an error due to the underlying - * clustering model (e.g. noise point being covered by a cluster) - */ - int pointErrorByModel = 0; - /** - * experimental (default: disabled) - * use exponential connectivity function to model different behavior: - * closer points will have a stronger connection compared to the linear - * function. Use ConnRefXValue and ConnX to better parameterize lambda, which - * controls the decay of the connectivity - */ - bool useExpConnectivity = false; - double lambdaConnRefXValue = 0.01; - double lambdaConnX = 4; - double lambdaConn; - /** - * the threshold which defines when ground truth clusters will be merged. - * set to 1 to disable merging - */ - double tauConnection = 0.5; - - /** - * defines how many nearest neighbors will be used - */ - int knnNeighbourhood = 2; - - /** average knn distance of all points in the cluster*/ - double knnMeanAvg = 0; - - /** average deviation of knn distance of all points*/ - double knnDevAvg = 0; - - struct Cluster - { - std::unordered_set points; - std::vector vpoints; - std::atomic knnMeanAvg = 0.0, knnDevAvg = 0.0; - void Insert(int i) - { - points.insert(i); - vpoints.push_back(i); - } - void CalcKnn(int k, const std::vector &inputs); - }; - std::unordered_map clusters; - std::map matchMap; - double cmm; + param_t param; + /** + * number of non-noise points that will create an error due to the underlying + * clustering model (e.g. point being covered by two clusters representing + * different classes) + */ + int noiseErrorByModel = 0; + /** + * number of noise points that will create an error due to the underlying + * clustering model (e.g. 
noise point being covered by a cluster) + */ + int pointErrorByModel = 0; + /** + * experimental (default: disabled) + * use exponential connectivity function to model different behavior: + * closer points will have a stronger connection compared to the linear + * function. Use ConnRefXValue and ConnX to better parameterize lambda, which + * controls the decay of the connectivity + */ + bool useExpConnectivity = false; + double lambdaConnRefXValue = 0.01; + double lambdaConnX = 4; + double lambdaConn; + /** + * the threshold which defines when ground truth clusters will be merged. + * set to 1 to disable merging + */ + double tauConnection = 0.5; + + /** + * defines how many nearest neighbors will be used + */ + int knnNeighbourhood = 2; + + /** average knn distance of all points in the cluster*/ + double knnMeanAvg = 0; + + /** average deviation of knn distance of all points*/ + double knnDevAvg = 0; + + struct Cluster { + std::unordered_set points; + std::vector vpoints; + std::atomic knnMeanAvg = 0.0, knnDevAvg = 0.0; + void Insert(int i) { + points.insert(i); + vpoints.push_back(i); + } + void CalcKnn(int k, const std::vector &inputs); + }; + std::unordered_map clusters; + std::map matchMap; + double cmm; public: - CMM(param_t param) : param(param) {} - double Evaluate(const std::vector &inputs, const std::vector &predicts); - void AnalyseGT(const std::vector &inputs, const std::vector &predicts, - bool enableClassMerge); - double CalcConn(int, const std::vector &); - double CalcConn(int, int, const std::vector &); - void CalcMatch(const std::vector &inputs, const std::vector &predicts); - void CalcError(const std::vector &inputs, const std::vector &predicts); - double MisplacedError(int, int, int, const std::vector &); - double NoiseError(int, int, int, const std::vector &); - double MissedError(int, int, int, const std::vector &); + CMM(param_t param) : param(param) {} + double Evaluate(const std::vector &inputs, + const std::vector &predicts); + void 
AnalyseGT(const std::vector &inputs, + const std::vector &predicts, bool enableClassMerge); + double CalcConn(int, const std::vector &); + double CalcConn(int, int, const std::vector &); + void CalcMatch(const std::vector &inputs, + const std::vector &predicts); + void CalcError(const std::vector &inputs, + const std::vector &predicts); + double MisplacedError(int, int, int, const std::vector &); + double NoiseError(int, int, int, const std::vector &); + double MissedError(int, int, int, const std::vector &); }; -} // namespace SESAME -#endif // SESAME_INCLUDE_EVALUATION_CMM_HPP_ +} // namespace SESAME +#endif // SESAME_INCLUDE_EVALUATION_CMM_HPP_ diff --git a/include/Evaluation/Euclidean.hpp b/include/Evaluation/Euclidean.hpp index 5fba9611..b94300c4 100644 --- a/include/Evaluation/Euclidean.hpp +++ b/include/Evaluation/Euclidean.hpp @@ -6,17 +6,17 @@ #define SESAME_INCLUDE_EVALUATION_EUCLIDEAN_HPP_ #include #include -namespace SESAME -{ +namespace SESAME { -class Euclidean -{ +class Euclidean { public: - static void euclideanCost( - int numberOfPoints, int numberOfCenters, int dimention, const std::vector &inputs, - const std::vector &results); // EuclideanCost, defined in StreamKM++: A - // clustering algorithm for data streams + static void + euclideanCost(int numberOfPoints, int numberOfCenters, int dimention, + const std::vector &inputs, + const std::vector + &results); // EuclideanCost, defined in StreamKM++: A + // clustering algorithm for data streams }; -} // namespace SESAME -#endif // SESAME_INCLUDE_EVALUATION_EUCLIDEAN_HPP_ +} // namespace SESAME +#endif // SESAME_INCLUDE_EVALUATION_EUCLIDEAN_HPP_ diff --git a/include/Evaluation/Evaluation.hpp b/include/Evaluation/Evaluation.hpp index d2c7e506..f4314854 100644 --- a/include/Evaluation/Evaluation.hpp +++ b/include/Evaluation/Evaluation.hpp @@ -14,47 +14,42 @@ #include -namespace SESAME -{ - -struct AccuracyRes -{ - double cmm = 0.0, purity = 0.0, nmi = 0.0; - int num_res = 0; - AccuracyRes() = default; 
- void Evaluate(const param_t ¶m, const std::vector &inputs, - const std::vector &predicts); - void Print() - { - std::cout << "num_res: " << num_res << std::endl; - std::cout << "cmm: " << cmm << std::endl; - std::cout << "purity: " << purity << std::endl; - std::cout << "nmi: " << nmi << std::endl; - } +namespace SESAME { + +struct AccuracyRes { + double cmm = 0.0, purity = 0.0, nmi = 0.0; + int num_res = 0; + AccuracyRes() = default; + void Evaluate(const param_t ¶m, const std::vector &inputs, + const std::vector &predicts); + void Print() { + std::cout << "num_res: " << num_res << std::endl; + std::cout << "cmm: " << cmm << std::endl; + std::cout << "purity: " << purity << std::endl; + std::cout << "nmi: " << nmi << std::endl; + } }; -struct PerfRes -{ - int64 win_us, ds_us, out_us, ref_us, sum_us; - double on_20 = 0.0, on_40 = 0.0, on_60 = 0.0, on_80 = 0.0, on_100 = 0.0; - double lat_us, et_s, qps; - void Print() - { - std::cout << "win_us: " << win_us << std::endl; - std::cout << "ds_us: " << ds_us << std::endl; - std::cout << "out_us: " << out_us << std::endl; - std::cout << "ref_us: " << ref_us << std::endl; - std::cout << "sum_us: " << sum_us << std::endl; - std::cout << "on_20: " << on_20 << std::endl; - std::cout << "on_40: " << on_40 << std::endl; - std::cout << "on_60: " << on_60 << std::endl; - std::cout << "on_80: " << on_80 << std::endl; - std::cout << "on_100: " << on_100 << std::endl; - std::cout << "lat_us: " << lat_us << std::endl; - std::cout << "et_s: " << et_s << std::endl; - std::cout << "qps: " << qps << std::endl; - } +struct PerfRes { + int64 win_us, ds_us, out_us, ref_us, sum_us; + double on_20 = 0.0, on_40 = 0.0, on_60 = 0.0, on_80 = 0.0, on_100 = 0.0; + double lat_us, et_s, qps; + void Print() { + std::cout << "win_us: " << win_us << std::endl; + std::cout << "ds_us: " << ds_us << std::endl; + std::cout << "out_us: " << out_us << std::endl; + std::cout << "ref_us: " << ref_us << std::endl; + std::cout << "sum_us: " << sum_us << 
std::endl; + std::cout << "on_20: " << on_20 << std::endl; + std::cout << "on_40: " << on_40 << std::endl; + std::cout << "on_60: " << on_60 << std::endl; + std::cout << "on_80: " << on_80 << std::endl; + std::cout << "on_100: " << on_100 << std::endl; + std::cout << "lat_us: " << lat_us << std::endl; + std::cout << "et_s: " << et_s << std::endl; + std::cout << "qps: " << qps << std::endl; + } }; -} // namespace SESAME -#endif // ONLINEMLBENCHMARK_EVALUATIONMETRICS_H +} // namespace SESAME +#endif // ONLINEMLBENCHMARK_EVALUATIONMETRICS_H diff --git a/include/Evaluation/NMI.hpp b/include/Evaluation/NMI.hpp index 0c1c8eca..276afe69 100644 --- a/include/Evaluation/NMI.hpp +++ b/include/Evaluation/NMI.hpp @@ -6,14 +6,13 @@ #define SESAME_INCLUDE_EVALUATION_NMI_HPP_ #include #include -namespace SESAME -{ +namespace SESAME { -class NMI -{ +class NMI { public: - static double Evaluate(const std::vector &inputs, - const std::vector &predicts, int gt_size, int predict_size); + static double Evaluate(const std::vector &inputs, + const std::vector &predicts, int gt_size, + int predict_size); }; -} // namespace SESAME -#endif // SESAME_INCLUDE_EVALUATION_NMI_HPP_ +} // namespace SESAME +#endif // SESAME_INCLUDE_EVALUATION_NMI_HPP_ diff --git a/include/Evaluation/Purity.hpp b/include/Evaluation/Purity.hpp index 722677e2..07ff4ad7 100644 --- a/include/Evaluation/Purity.hpp +++ b/include/Evaluation/Purity.hpp @@ -6,23 +6,22 @@ #define ONLINEMLBENCHMARK_PURITY_HPP_ #include #include -namespace SESAME -{ +namespace SESAME { -class Purity -{ +class Purity { private: - static void pointToGroup(const std::vector &input, - std::vector> &group); - static double calculateBelongsFromTwo(std::vector &groupA, - std::vector &groupB); - static double getMaxBelongs(std::vector &singleSample, - std::vector> >); + static void pointToGroup(const std::vector &input, + std::vector> &group); + static double calculateBelongsFromTwo(std::vector &groupA, + std::vector &groupB); + static double 
getMaxBelongs(std::vector &singleSample, + std::vector> >); public: - static double purityCost(const std::vector &inputs, - const std::vector &predicts, int dim, bool decay); + static double purityCost(const std::vector &inputs, + const std::vector &predicts, int dim, + bool decay); }; -} // namespace SESAME +} // namespace SESAME #endif \ No newline at end of file diff --git a/include/Python/Benne.hpp b/include/Python/Benne.hpp new file mode 100644 index 00000000..5357a607 --- /dev/null +++ b/include/Python/Benne.hpp @@ -0,0 +1,162 @@ +#ifndef SESAME_PYTHON_INCLUDE_BENNE_HPP_ +#define SESAME_PYTHON_INCLUDE_BENNE_HPP_ + +#include +#include + +#include "Algorithm/Algorithm.hpp" +#include "Algorithm/AlgorithmFactory.hpp" +#include "Algorithm/DataStructure/GenericFactory.hpp" + +#include +#include + +using namespace pybind11::literals; +using namespace SESAME; +using namespace std; +namespace py = pybind11; + +class Benne { +public: + Benne(int dim, int n_clusters, double threshold, int queue_size_threshold, + int dim_threshold, double variance_threshold, + int outliers_num_threshold, double outliers_dist_threshold) + : dim(dim), n_clusters(n_clusters), distance_threshold(threshold), + queue_size_threshold(queue_size_threshold), + dim_threshold(dim_threshold), variance_threshold(variance_threshold), + outliers_num_threshold(outliers_num_threshold), + outliers_dist_threshold(outliers_dist_threshold) { + param.algo = AlgoType::BenneType; + param.landmark = 10; + param.num_clusters = n_clusters; + param.dim = dim; + param.coreset_size = 2; + param.distance_threshold = distance_threshold; + param.benne_threshold.queue_size = queue_size_threshold; + param.benne_threshold.dim = dim_threshold; + param.benne_threshold.variance = variance_threshold; + param.benne_threshold.outliers_num = outliers_num_threshold; + param.benne_threshold.outliers_dist = outliers_dist_threshold; + algo = AlgorithmFactory::create(param); + sinkPtr = GenericFactory::New(param); + auto barrierPtr = 
GenericFactory::New>(1); + sinkPtr->setBarrier(barrierPtr); + sinkPtr->start(0); + algo->Init(); + } + + Benne &fit(py::array_t X) { + algo = AlgorithmFactory::create(param); + sinkPtr = GenericFactory::New(param); + auto barrierPtr = GenericFactory::New>(1); + sinkPtr->setBarrier(barrierPtr); + sinkPtr->start(0); + algo->Init(); + + partial_fit(X); + return *this; + } + + py::array_t fit_predict(py::array_t X) { + py::buffer_info buf = X.request(); + + if (buf.ndim != 2) + throw std::runtime_error("numpy.ndarray dims must be 2!"); + + double *ptr = static_cast(buf.ptr); + + auto num_elements = buf.shape[0] * buf.shape[1]; + auto num_vectors = num_elements / dim; + for (int i = 0; i < num_vectors; i++) { + PointPtr point = std::make_shared(dim, id++, ptr + i * dim); + inputs.push_back(point); + algo->RunOnline(point); + } + + algo->RunOffline(sinkPtr); + std::vector &results = sinkPtr->getResults(), predicts; + UtilityFunctions::groupByCenters(inputs, results, predicts, param.dim); + std::vector labels; + for (auto &point : predicts) { + labels.push_back(point->clu_id); + } + return py::array_t(labels.size(), labels.data()); + } + + py::dict get_params() { + return py::dict("distance_threshold"_a = distance_threshold, + "queue_size_threshold"_a = queue_size_threshold, + "dim_threshold"_a = dim_threshold, + "variance_threshold"_a = variance_threshold, + "outliers_num_threshold"_a = outliers_num_threshold, + "outliers_dist_threshold"_a = outliers_dist_threshold, + "n_clusters"_a = n_clusters); + } + + Benne &partial_fit(py::array_t X) { + py::buffer_info buf = X.request(); + + if (buf.ndim != 2) + throw std::runtime_error("numpy.ndarray dims must be 2!"); + + double *ptr = static_cast(buf.ptr); + + auto num_elements = buf.shape[0] * buf.shape[1]; + auto num_vectors = num_elements / dim; + for (int i = 0; i < num_vectors; i++) { + PointPtr point = std::make_shared(dim, id++, ptr + i * dim); + algo->RunOnline(point); + } + return *this; + } + + py::array_t 
predict(py::array_t X) { + py::buffer_info buf = X.request(); + + if (buf.ndim != 2) + throw std::runtime_error("numpy.ndarray dims must be 2!"); + + double *ptr = static_cast(buf.ptr); + + vector inputs; + + auto num_elements = buf.shape[0] * buf.shape[1]; + auto num_vectors = num_elements / dim; + for (int i = 0; i < num_vectors; i++) { + SESAME::PointPtr point = + std::make_shared(dim, id++, ptr + i * dim); + inputs.push_back(point); + } + algo->RunOffline(sinkPtr); + std::vector &results = sinkPtr->getResults(), predicts; + UtilityFunctions::groupByCenters(inputs, results, predicts, param.dim); + std::vector labels; + for (auto &point : predicts) { + labels.push_back(point->clu_id); + } + return py::array_t(labels.size(), labels.data()); + } + + void set_output(bool output) { throw std::runtime_error("Not supported"); } + + void set_params(py::dict params) { + throw std::runtime_error("Not supported"); + } + +private: + double distance_threshold = 0.5; + int queue_size_threshold = 1; + int dim_threshold = 30; + double variance_threshold = 100.0; + int outliers_num_threshold = 200; + double outliers_dist_threshold = 50.0; + int n_clusters = 2; + int dim; + uint64_t id = 0; + SESAME::SesameParam param; + SESAME::AlgorithmPtr algo; + std::vector inputs; + DataSinkPtr sinkPtr; +}; + +#endif // SESAME_PYTHON_INCLUDE_BENNE_HPP_ \ No newline at end of file diff --git a/include/Python/Birch.hpp b/include/Python/Birch.hpp new file mode 100644 index 00000000..b780c3d3 --- /dev/null +++ b/include/Python/Birch.hpp @@ -0,0 +1,144 @@ +#include "pybind11/numpy.h" +#include "pybind11/pybind11.h" + +#include "Algorithm/Algorithm.hpp" +#include "Algorithm/AlgorithmFactory.hpp" +#include "Algorithm/DataStructure/GenericFactory.hpp" + +#include +#include + +using namespace pybind11::literals; +using namespace SESAME; +using namespace std; +namespace py = pybind11; + +class Birch { +public: + Birch(int dim, int n_clusters, double threshold, int branching_factor) + : 
threshold(threshold), branching_factor(branching_factor), + n_clusters(n_clusters), dim(dim) { + param.algo = AlgoType::BirchType; + param.distance_threshold = threshold; + param.landmark = 10; + param.max_in_nodes = branching_factor; + param.max_leaf_nodes = branching_factor; + param.num_clusters = n_clusters; + param.dim = dim; + algo = AlgorithmFactory::create(param); + sinkPtr = GenericFactory::New(param); + auto barrierPtr = GenericFactory::New>(1); + sinkPtr->setBarrier(barrierPtr); + sinkPtr->start(0); + algo->Init(); + } + + Birch &fit(py::array_t X) { + algo = AlgorithmFactory::create(param); + sinkPtr = GenericFactory::New(param); + auto barrierPtr = GenericFactory::New>(1); + sinkPtr->setBarrier(barrierPtr); + sinkPtr->start(0); + algo->Init(); + + partial_fit(X); + return *this; + } + + py::array_t fit_predict(py::array_t X) { + py::buffer_info buf = X.request(); + + if (buf.ndim != 2) + throw std::runtime_error("numpy.ndarray dims must be 2!"); + + double *ptr = static_cast(buf.ptr); + + auto num_elements = buf.shape[0] * buf.shape[1]; + auto num_vectors = num_elements / dim; + for (int i = 0; i < num_vectors; i++) { + PointPtr point = std::make_shared(dim, id++, ptr + i * dim); + inputs.push_back(point); + algo->RunOnline(point); + } + + algo->RunOffline(sinkPtr); + std::vector &results = sinkPtr->getResults(), predicts; + UtilityFunctions::groupByCenters(inputs, results, predicts, param.dim); + // cout << "results size: " << results.size() << endl; + // cout << "inputs size: " << inputs.size() << endl; + // cout << "predicts size: " << predicts.size() << endl; + std::vector labels; + for (auto &point : predicts) { + labels.push_back(point->clu_id); + } + return py::array_t(labels.size(), labels.data()); + } + + py::dict get_params() { + return py::dict("threshold"_a = threshold, + "branching_factor"_a = branching_factor, + "n_clusters"_a = n_clusters); + } + + Birch &partial_fit(py::array_t X) { + py::buffer_info buf = X.request(); + + if (buf.ndim 
!= 2) + throw std::runtime_error("numpy.ndarray dims must be 2!"); + + double *ptr = static_cast(buf.ptr); + + auto num_elements = buf.shape[0] * buf.shape[1]; + auto num_vectors = num_elements / dim; + for (int i = 0; i < num_vectors; i++) { + PointPtr point = std::make_shared(dim, id++, ptr + i * dim); + algo->RunOnline(point); + } + return *this; + } + + py::array_t predict(py::array_t X) { + py::buffer_info buf = X.request(); + + if (buf.ndim != 2) + throw std::runtime_error("numpy.ndarray dims must be 2!"); + + double *ptr = static_cast(buf.ptr); + + vector inputs; + + auto num_elements = buf.shape[0] * buf.shape[1]; + auto num_vectors = num_elements / dim; + for (int i = 0; i < num_vectors; i++) { + SESAME::PointPtr point = + std::make_shared(dim, id++, ptr + i * dim); + inputs.push_back(point); + } + algo->RunOffline(sinkPtr); + std::vector &results = sinkPtr->getResults(), predicts; + UtilityFunctions::groupByCenters(inputs, results, predicts, param.dim); + std::vector labels; + for (auto &point : predicts) { + labels.push_back(point->clu_id); + } + return py::array_t(labels.size(), labels.data()); + } + + void set_output(bool output) { throw std::runtime_error("Not supported"); } + + void set_params(py::dict params) { + throw std::runtime_error("Not supported"); + } + +private: + double threshold; + int branching_factor; + int n_clusters; + int dim; + bool output; + uint64_t id = 0; + SesameParam param; + AlgorithmPtr algo; + std::vector inputs; + DataSinkPtr sinkPtr; +}; diff --git a/include/Sinks/DataSink.hpp b/include/Sinks/DataSink.hpp index 47936cd1..0bfc1742 100644 --- a/include/Sinks/DataSink.hpp +++ b/include/Sinks/DataSink.hpp @@ -1,4 +1,5 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) // // Created by Shuhao Zhang on 20/07/2021. 
@@ -19,33 +20,31 @@ #include #include -namespace SESAME -{ +namespace SESAME { class DataSink; typedef std::shared_ptr DataSinkPtr; -class DataSink -{ +class DataSink { private: - std::vector output; - std::shared_ptr> outputQueue; - SingleThreadPtr threadPtr; - std::atomic_bool sourceEnd; - std::atomic_bool finished; - BarrierPtr barrierPtr; - param_t param; + std::vector output; + std::shared_ptr> outputQueue; + SingleThreadPtr threadPtr; + std::atomic_bool sourceEnd; + std::atomic_bool finished; + BarrierPtr barrierPtr; + param_t param; public: - DataSink(const param_t &); - ~DataSink(); - void put(PointPtr resultPtr); - void runningRoutine(); - bool start(int id); - bool stop(); - void Ended(); - bool isFinished(); - std::vector getResults(); - void setBarrier(BarrierPtr barrierPtr); + DataSink(const param_t &); + ~DataSink(); + void put(PointPtr resultPtr); + void runningRoutine(); + bool start(int id); + bool stop(); + void Ended(); + bool isFinished(); + std::vector &getResults(); + void setBarrier(BarrierPtr barrierPtr); }; -} // namespace SESAME -#endif // SESAME_INCLUDE_SINKS_DATASINK_HPP_ +} // namespace SESAME +#endif // SESAME_INCLUDE_SINKS_DATASINK_HPP_ diff --git a/include/Sinks/DataSinkFactory.hpp b/include/Sinks/DataSinkFactory.hpp index 5a40a05c..0704af4f 100644 --- a/include/Sinks/DataSinkFactory.hpp +++ b/include/Sinks/DataSinkFactory.hpp @@ -1,4 +1,5 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) // // Created by Shuhao Zhang on 17/8/21. 
@@ -7,13 +8,11 @@ #ifndef SESAME_SRC_SINKS_DATASINKFACTORY_HPP_ #define SESAME_SRC_SINKS_DATASINKFACTORY_HPP_ #include -namespace SESAME -{ -class DataSinkFactory -{ +namespace SESAME { +class DataSinkFactory { public: - static SESAME::DataSinkPtr create(); + static SESAME::DataSinkPtr create(); }; -} // namespace SESAME +} // namespace SESAME -#endif // SESAME_SRC_SINKS_DATASINKFACTORY_HPP_ +#endif // SESAME_SRC_SINKS_DATASINKFACTORY_HPP_ diff --git a/include/Sources/DataSource.hpp b/include/Sources/DataSource.hpp index 661b1e77..df056870 100644 --- a/include/Sources/DataSource.hpp +++ b/include/Sources/DataSource.hpp @@ -1,4 +1,5 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) // // Created by Shuhao Zhang on 20/07/2021. @@ -23,37 +24,35 @@ #include #include -namespace SESAME -{ +namespace SESAME { class DataSource; typedef std::shared_ptr DataSourcePtr; -class DataSource -{ +class DataSource { private: - std::vector input; - std::shared_ptr> inputQueue; - SingleThreadPtr threadPtr; - BarrierPtr barrierPtr; - TimeMeter overallMeter; - std::atomic_bool sourceEnd; - param_t param; + std::vector input; + std::shared_ptr> inputQueue; + SingleThreadPtr threadPtr; + BarrierPtr barrierPtr; + TimeMeter overallMeter; + std::atomic_bool sourceEnd; + param_t param; public: - void load(); - bool empty(); - PointPtr get(); - std::vector getInputs(); - DataSource(const param_t &); - ~DataSource(); - void runningRoutine(); - bool start(int i); - bool stop(); - void setBarrier(BarrierPtr barrierPtr); - void printTime(); - bool sourceEnded(); - int size() { return inputQueue->read_available(); } - void push(const PointPtr &p); + void load(); + bool empty(); + PointPtr get(); + std::vector getInputs(); + DataSource(const param_t &); + ~DataSource(); + void runningRoutine(); + bool start(int i); + bool stop(); + void setBarrier(BarrierPtr barrierPtr); + 
void printTime(); + bool sourceEnded(); + int size() { return inputQueue->read_available(); } + void push(const PointPtr &p); }; -} // namespace SESAME -#endif // SESAME_SRC_SOURCES_DATASOURCE_HPP_ +} // namespace SESAME +#endif // SESAME_SRC_SOURCES_DATASOURCE_HPP_ diff --git a/include/Sources/DataSourceFactory.hpp b/include/Sources/DataSourceFactory.hpp index e814ed97..d240c3de 100644 --- a/include/Sources/DataSourceFactory.hpp +++ b/include/Sources/DataSourceFactory.hpp @@ -1,4 +1,5 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) // // Created by Shuhao Zhang on 15/8/2021. @@ -9,13 +10,11 @@ #include -namespace SESAME -{ -class DataSourceFactory -{ +namespace SESAME { +class DataSourceFactory { public: - static SESAME::DataSourcePtr create(); + static SESAME::DataSourcePtr create(); }; -} // namespace SESAME +} // namespace SESAME -#endif // SESAME_DATASOURCEFACTORY_H +#endif // SESAME_DATASOURCEFACTORY_H diff --git a/include/Timer/TimeMeter.hpp b/include/Timer/TimeMeter.hpp index 7aace843..fdb104ba 100644 --- a/include/Timer/TimeMeter.hpp +++ b/include/Timer/TimeMeter.hpp @@ -1,4 +1,5 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) #ifndef SESAME_SRC_TIMER_CYCLEMETER_HPP_ #define SESAME_SRC_TIMER_CYCLEMETER_HPP_ @@ -14,342 +15,352 @@ #include #include -namespace SESAME -{ -struct T_TIMER -{ +namespace SESAME { +struct T_TIMER { #ifndef NO_TIMING - // the start and end timestamp of the algorithm - struct timespec start, end, overallPre; - /** - * accumulate for data point coming - * online_increment_timer_pre: start of accumulate online timer ( start of every xx s) - * online_timer: end of the online increment part - * */ - timespec online_increment_timer_pre, onlineAccTimer, online_timer; - - /** - * 
initialTimerStart: Start of initial part - * initialTimer: end of initial part - * */ - timespec initialTimerStart, initialTimer; - - /** - * accumulate for data point coming - * dataInsertTimer_pre: start of accumulated data insertion timer ( start of every xx s) - * dataInsertTimer: end of the online increment part - **/ - - timespec dataInsertTimer_pre, dataInsertTimer; // accumulate. - - /** - * accumulate for data point coming - * clusterUpdateTimer_pre: start of accumulated cluster update timer ( start of every xx s) - * clusterUpdateTimer: end of the cluster update part - **/ - timespec clusterUpdateTimer_pre, clusterUpdateTimer; // accumulate. - /** - * accumulate for data point coming - * outlierDetectionTimer_pre: start of accumulated outlier Detection timer ( start of every xx - *s) outlierDetectionTimer: end of the outlier Detection part - **/ - - timespec outlierDetectionTimer_pre, outlierDetectionTimer; // accumulate. - - /** - * accumulate for data point coming - * pruneTimer_pre: start of accumulated prune timer (when every periodical prune function - *starts) pruneTimer: end of the prune part - **/ - - timespec pruneTimer_pre, pruneTimer; // accumulate. 
- - /** - * accumulate for data point coming - * pruneTimer_pre: start of accumulated snapshot timer - * pruneTimer: end of the snapshot part - **/ - timespec snapshotTimer_pre, snapshotTimer; // accumulate (Special for CluStream key design) - timespec finalCluster_pre, - finalClusterTimer; // accumulate (Special for online or periodical forming final cluster) - - /** - * refinementStart: Start of refinement part - * refinementTimer: end of refinement part - **/ - - timespec refinementStart, refinementTimer; // offline refinement - - // Store the overall elapsed time every xx s - std::vector recordOverall; - - // Store the overall elapsed time every xx s - std::vector recordOnline; - - // Store the data insertion elapsed time every xx s - std::vector recordInsert; - - // Store the data Concept drift elapsed time every xx s - std::vector recordConceptDrift; - - // Store the data Outlier Detection every xx s - std::vector recordOutlierDetection; - - // Store the data prune elapsed time when it occurs - std::vector prune; - - std::vector snapshot; - - std::vector finalCluster; - - int pruneCnt = 0; - int snapshotCnt = 0; - int periodicalCluCnt = 0; - - /* - uint64_t overall_timer, online_increment_timer_pre, online_increment_timer;//accumulate for data - point coming, uint64_t online_timer; uint64_t initialTimer = 0;//initialTimer_pre = 0, uint64_t - dataInsertTimer_pre = 0, dataInsertTimer = 0;//accumulate. uint64_t conceptDriftTimer_pre = 0, - conceptDriftTimer = 0;//accumulate. uint64_t outlierDetectionTimer_pre = 0, - outlierDetectionTimer = 0;//accumulate. uint64_t pruneTimer_pre = 0, pruneTimer = - 0;//accumulate. 
uint64_t snapshotTimer_pre = 0, snapshotTimer = 0;//accumulate(Special for - CluStream key design) uint64_t refinementTimer = 0;//offline refinement std::vector - recordOverall; std::vector recordInsert; std::vector recordConceptDrift; - std::vector recordOutlierDetection; - std::vector recordID; - */ + // the start and end timestamp of the algorithm + struct timespec start, end, overallPre; + /** + * accumulate for data point coming + * online_increment_timer_pre: start of accumulate online timer ( start of + * every xx s) online_timer: end of the online increment part + * */ + timespec online_increment_timer_pre, onlineAccTimer, online_timer; + + /** + * initialTimerStart: Start of initial part + * initialTimer: end of initial part + * */ + timespec initialTimerStart, initialTimer; + + /** + * accumulate for data point coming + * dataInsertTimer_pre: start of accumulated data insertion timer ( start of + *every xx s) dataInsertTimer: end of the online increment part + **/ + + timespec dataInsertTimer_pre, dataInsertTimer; // accumulate. + + /** + * accumulate for data point coming + * clusterUpdateTimer_pre: start of accumulated cluster update timer ( start + *of every xx s) clusterUpdateTimer: end of the cluster update part + **/ + timespec clusterUpdateTimer_pre, clusterUpdateTimer; // accumulate. + /** + * accumulate for data point coming + * outlierDetectionTimer_pre: start of accumulated outlier Detection timer ( + *start of every xx s) outlierDetectionTimer: end of the outlier Detection + *part + **/ + + timespec outlierDetectionTimer_pre, outlierDetectionTimer; // accumulate. + + /** + * accumulate for data point coming + * pruneTimer_pre: start of accumulated prune timer (when every periodical + *prune function starts) pruneTimer: end of the prune part + **/ + + timespec pruneTimer_pre, pruneTimer; // accumulate. 
+ + /** + * accumulate for data point coming + * pruneTimer_pre: start of accumulated snapshot timer + * pruneTimer: end of the snapshot part + **/ + timespec snapshotTimer_pre, + snapshotTimer; // accumulate (Special for CluStream key design) + timespec finalCluster_pre, + finalClusterTimer; // accumulate (Special for online or periodical forming + // final cluster) + + /** + * refinementStart: Start of refinement part + * refinementTimer: end of refinement part + **/ + + timespec refinementStart, refinementTimer; // offline refinement + + // Store the overall elapsed time every xx s + std::vector recordOverall; + + // Store the overall elapsed time every xx s + std::vector recordOnline; + + // Store the data insertion elapsed time every xx s + std::vector recordInsert; + + // Store the data Concept drift elapsed time every xx s + std::vector recordConceptDrift; + + // Store the data Outlier Detection every xx s + std::vector recordOutlierDetection; + + // Store the data prune elapsed time when it occurs + std::vector prune; + + std::vector snapshot; + + std::vector finalCluster; + + int pruneCnt = 0; + int snapshotCnt = 0; + int periodicalCluCnt = 0; + + /* + uint64_t overall_timer, online_increment_timer_pre, + online_increment_timer;//accumulate for data point coming, uint64_t + online_timer; uint64_t initialTimer = 0;//initialTimer_pre = 0, uint64_t + dataInsertTimer_pre = 0, dataInsertTimer = 0;//accumulate. uint64_t + conceptDriftTimer_pre = 0, conceptDriftTimer = 0;//accumulate. uint64_t + outlierDetectionTimer_pre = 0, outlierDetectionTimer = 0;//accumulate. + uint64_t pruneTimer_pre = 0, pruneTimer = 0;//accumulate. 
uint64_t + snapshotTimer_pre = 0, snapshotTimer = 0;//accumulate(Special for CluStream + key design) uint64_t refinementTimer = 0;//offline refinement + std::vector recordOverall; std::vector recordInsert; + std::vector recordConceptDrift; std::vector + recordOutlierDetection; std::vector recordID; + */ #endif }; -class TimeMeter -{ +class TimeMeter { private: - struct timespec start, stop; - int interval = 1, intervalCnt = 0; - T_TIMER timer; - bool InsertJudge = false; - // the overall elapsed time of every part - long overallTime = 0; - long overallPreTime = 0; - long onlineTime = 0; - long dataInsertTime = 0; - long onlineClusterUpdateTime = 0; - std::vector recordOverall; - - long outlierDetectionTime = 0; // if possible - long pruneTime = 0; // if possible - long initialTime = 0; // if possible - long snapshotTime = 0; // if possible - long refinementTime = 0; // if possible - long finalClusterTime = 0; // if possible - long otherTime = 0; + struct timespec start, stop; + int interval = 1, intervalCnt = 0; + T_TIMER timer; + bool InsertJudge = false; + // the overall elapsed time of every part + long overallTime = 0; + long overallPreTime = 0; + long onlineTime = 0; + long dataInsertTime = 0; + long onlineClusterUpdateTime = 0; + std::vector recordOverall; + + long outlierDetectionTime = 0; // if possible + long pruneTime = 0; // if possible + long initialTime = 0; // if possible + long snapshotTime = 0; // if possible + long refinementTime = 0; // if possible + long finalClusterTime = 0; // if possible + long otherTime = 0; public: - void setInterval(int interV); - void START_MEASURE(); - void END_MEASURE(); - long MeterUSEC(); // return the meter result in micro second unit. - - // static void MEASURE(timespec Time); - long MeterUSEC(timespec startAcc, - timespec endAcc); // return the meter result in micro second unit. 
- - // the overall start and end time of every part - void overallStartMeasure(); - void overallEndMeasure(); - long MeterOverallUSEC(); - // the start of every xx s - void overallAccMeasure(); - // start of online part - // void MeterOverallAccUSEC(); - - void onlineAccMeasure(); - void onlineAccEMeasure(); - // end of online part - void onlineEndMeasure(); - void MeterOnlineAccUSEC(); - long MeterOnlineUSEC(); - - // start of initial part - void initialMeasure(); - // end of initial part - void initialEndMeasure(); - long MeterInitialUSEC(); - - // start of data Insert part - void dataInsertAccMeasure(); - // end of data Insert part - void dataInsertEndMeasure(); - void MeterDataInsertAccUSEC(); - long MeterDataInsertUSEC(); - - // start of Online cluster update part - void clusterUpdateAccMeasure(); - // end of Online cluster update - void clusterUpdateEndMeasure(); - void MeterClusterUpdateAccUSEC(); - long MeterClusterUpdateUSEC(); - - // start of outlier Detection part - void outlierDetectionAccMeasure(); - // end of outlier Detection part - void outlierDetectionEndMeasure(); - void MeterOutlierDetectionAccUSEC(); - long MeterOutlierDetectionUSEC(); - - // start of prune part - void pruneAccMeasure(); - // end of prune part - void pruneEndMeasure(); - void MeterPruneAccUSEC(); - long MeterPruneUSEC(); - - // start of snapshot part - void snapshotAccMeasure(); - // end of snapshot part - void snapshotEndMeasure(); - void MeterSnapshotAccUSEC(); - long MeterSnapshotUSEC(); - - // Used for incremental or periodical - // start of forming final clusters - void finalClusterAccMeasure(); - // end of forming final clusters - void finalClusterEndMeasure(); - void MeterFinalClusterAccUSEC(); - long MeterFinalClusterUSEC(); - - // start of refinement part - void refinementStartMeasure(); - // end of refinement part - void refinementEndMeasure(); - long MeterRefinementUSEC(); - - // Store the result of every xx s - void AccumulateWithPointTimer(timespec start, timespec 
end, long elapsedTime, - std::vector timerVector); - long getOnlineEtime(); - - void OverallPreUpdate(); - - void setOverallTime(long overallT); - void setOnlineTime(long onlineT); - void setDataInsertTime(long dataInsertT); - void setClusterUpdateTime(long clusterUpdateT); - void setOutlierDetectionTime(long outlierDetectionT); - - void setInitialTime(long initialT); - // if possible - void setRefinementTime(long refinementT); - // if possible - void setSnapshotTime(long snapshotT); - void setPruneTime(long pruneT); - - /** print out the execution time statistics of stream clustering algorithms */ - void breakdown_global(bool initial, bool snapshot, bool outlierBuffer, - bool refine); // int64_t total_results, - /** print out the execution time statistics of stream clustering algorithms */ - void printTime(bool initial, bool snapshot, bool outlierBuffer, bool finalCluster); // - void printCumulative(); - // TODO the code below will be removed later - /* - #ifndef BEGIN_MEASURE_INITIALIZE - #define BEGIN_MEASURE_INITIALIZE(timer) \ - startTimer(&(timer)->initialTimer); - #endif - - #ifndef END_MEASURE_INITIALIZE - #define END_MEASURE_INITIALIZE(timer) \ - stopTimer(&(timer)->initialTimer); - #endif - - - #ifndef BEGIN_MEASURE_ONLINE - #define BEGIN_MEASURE_ONLINE(timer) \ - startTimer(&(timer)->online_timer); - #endif - - #ifndef END_MEASURE_ONLINE - #define END_MEASURE_ONLINE(timer) \ - stopTimer(&(timer)->online_timer); - #endif - - #ifndef BEGIN_MEASURE_ONLINE_ACC - #define BEGIN_MEASURE_ONLINE_ACC(timer) \ - startTimer(&(timer)->online_timer_pre); - #endif - */ + void setInterval(int interV); + void START_MEASURE(); + void END_MEASURE(); + long MeterUSEC(); // return the meter result in micro second unit. + + // static void MEASURE(timespec Time); + long + MeterUSEC(timespec startAcc, + timespec endAcc); // return the meter result in micro second unit. 
+ + // the overall start and end time of every part + void overallStartMeasure(); + void overallEndMeasure(); + long MeterOverallUSEC(); + // the start of every xx s + void overallAccMeasure(); + // start of online part + // void MeterOverallAccUSEC(); + + void onlineAccMeasure(); + void onlineAccEMeasure(); + // end of online part + void onlineEndMeasure(); + void MeterOnlineAccUSEC(); + long MeterOnlineUSEC(); + + // start of initial part + void initialMeasure(); + // end of initial part + void initialEndMeasure(); + long MeterInitialUSEC(); + + // start of data Insert part + void dataInsertAccMeasure(); + // end of data Insert part + void dataInsertEndMeasure(); + void MeterDataInsertAccUSEC(); + long MeterDataInsertUSEC(); + + // start of Online cluster update part + void clusterUpdateAccMeasure(); + // end of Online cluster update + void clusterUpdateEndMeasure(); + void MeterClusterUpdateAccUSEC(); + long MeterClusterUpdateUSEC(); + + // start of outlier Detection part + void outlierDetectionAccMeasure(); + // end of outlier Detection part + void outlierDetectionEndMeasure(); + void MeterOutlierDetectionAccUSEC(); + long MeterOutlierDetectionUSEC(); + + // start of prune part + void pruneAccMeasure(); + // end of prune part + void pruneEndMeasure(); + void MeterPruneAccUSEC(); + long MeterPruneUSEC(); + + // start of snapshot part + void snapshotAccMeasure(); + // end of snapshot part + void snapshotEndMeasure(); + void MeterSnapshotAccUSEC(); + long MeterSnapshotUSEC(); + + // Used for incremental or periodical + // start of forming final clusters + void finalClusterAccMeasure(); + // end of forming final clusters + void finalClusterEndMeasure(); + void MeterFinalClusterAccUSEC(); + long MeterFinalClusterUSEC(); + + // start of refinement part + void refinementStartMeasure(); + // end of refinement part + void refinementEndMeasure(); + long MeterRefinementUSEC(); + + // Store the result of every xx s + void AccumulateWithPointTimer(timespec start, timespec 
end, long elapsedTime, + std::vector timerVector); + long getOnlineEtime(); + + void OverallPreUpdate(); + + void setOverallTime(long overallT); + void setOnlineTime(long onlineT); + void setDataInsertTime(long dataInsertT); + void setClusterUpdateTime(long clusterUpdateT); + void setOutlierDetectionTime(long outlierDetectionT); + + void setInitialTime(long initialT); + // if possible + void setRefinementTime(long refinementT); + // if possible + void setSnapshotTime(long snapshotT); + void setPruneTime(long pruneT); + + /** print out the execution time statistics of stream clustering algorithms */ + void breakdown_global(bool initial, bool snapshot, bool outlierBuffer, + bool refine); // int64_t total_results, + /** print out the execution time statistics of stream clustering algorithms */ + void printTime(bool initial, bool snapshot, bool outlierBuffer, + bool finalCluster); // + void printCumulative(); + // TODO the code below will be removed later + /* + #ifndef BEGIN_MEASURE_INITIALIZE + #define BEGIN_MEASURE_INITIALIZE(timer) \ + startTimer(&(timer)->initialTimer); + #endif + + #ifndef END_MEASURE_INITIALIZE + #define END_MEASURE_INITIALIZE(timer) \ + stopTimer(&(timer)->initialTimer); + #endif + + + #ifndef BEGIN_MEASURE_ONLINE + #define BEGIN_MEASURE_ONLINE(timer) \ + startTimer(&(timer)->online_timer); + #endif + + #ifndef END_MEASURE_ONLINE + #define END_MEASURE_ONLINE(timer) \ + stopTimer(&(timer)->online_timer); + #endif + + #ifndef BEGIN_MEASURE_ONLINE_ACC + #define BEGIN_MEASURE_ONLINE_ACC(timer) \ + startTimer(&(timer)->online_timer_pre); + #endif + */ #ifndef /*END_MEASURE_ONLINE_ACC*/ NO_TIMING -# define END_MEASURE_ONLINE_ACC(timer) \ - accTimer(&(timer)->online_timer_pre, \ - &(timer)->online_timer); // ONLINE one-pass absorbing data time +#define END_MEASURE_ONLINE_ACC(timer) \ + accTimer(&(timer)->online_timer_pre, \ + &(timer)->online_timer); // ONLINE one-pass absorbing data time #endif #ifndef BEGIN_MEASURE_INSERT_ACC -# define 
BEGIN_MEASURE_INSERT_ACC(timer) startTimer(&(timer)->dataInsertTimer_pre); +#define BEGIN_MEASURE_INSERT_ACC(timer) \ + startTimer(&(timer)->dataInsertTimer_pre); #endif #ifndef /*END_MEASURE_INSERT_ACC*/ NO_TIMING -# define END_MEASURE_INSERT_ACC(timer) \ - accTimer(&(timer)->dataInsertTimer_pre, &(timer)->dataInsertTimer); /* data insert time */ +#define END_MEASURE_INSERT_ACC(timer) \ + accTimer(&(timer)->dataInsertTimer_pre, \ + &(timer)->dataInsertTimer); /* data insert time */ #endif #ifndef BEGIN_MEASURE_CONDRIFT_ACC -# define BEGIN_MEASURE_CONDRIFT_ACC(timer) startTimer(&(timer)->conceptDriftTimer_pre); +#define BEGIN_MEASURE_CONDRIFT_ACC(timer) \ + startTimer(&(timer)->conceptDriftTimer_pre); #endif #ifndef /*END_MEASURE_CONDRIFT_ACC*/ NO_TIMING -# define END_MEASURE_CONDRIFT_ACC(timer) \ - accTimer(&(timer)->conceptDriftTimer_pre, \ - &(timer)->conceptDriftTimer); /* Concept drift time */ +#define END_MEASURE_CONDRIFT_ACC(timer) \ + accTimer(&(timer)->conceptDriftTimer_pre, \ + &(timer)->conceptDriftTimer); /* Concept drift time */ #endif #ifndef BEGIN_MEASURE_PRUNE_ACC -# define BEGIN_MEASURE_PRUNE_ACC(timer) startTimer(&(timer)->pruneTimer_pre); +#define BEGIN_MEASURE_PRUNE_ACC(timer) startTimer(&(timer)->pruneTimer_pre); #endif #ifndef /*END_MEASURE_PRUNE_ACC*/ NO_TIMING -# define END_MEASURE_PRUNE_ACC(timer) \ - accTimer(&(timer)->pruneTimer_pre, &(timer)->pruneTimer); /* Outlier prune time */ +#define END_MEASURE_PRUNE_ACC(timer) \ + accTimer(&(timer)->pruneTimer_pre, \ + &(timer)->pruneTimer); /* Outlier prune time */ #endif #ifndef BEGIN_MEASURE_SNAPSHOT_ACC -# define BEGIN_MEASURE_SNAPSHOT_ACC(timer) startTimer(&(timer)->snapshotTimer_pre); +#define BEGIN_MEASURE_SNAPSHOT_ACC(timer) \ + startTimer(&(timer)->snapshotTimer_pre); #endif #ifndef /*END_MEASURE_SNAPSHOT_ACC*/ NO_TIMING -# define END_MEASURE_SNAPSHOT_ACC(timer) \ - accTimer(&(timer)->snapshotTimer_pre, &(timer)->snapshotTimer); // Taking Snapshots time +#define 
END_MEASURE_SNAPSHOT_ACC(timer) \ + accTimer(&(timer)->snapshotTimer_pre, \ + &(timer)->snapshotTimer); // Taking Snapshots time #endif - /* - #ifndef BEGIN_MEASURE_REFINEMENT - #define BEGIN_MEASURE_REFINEMENT(timer) \ - startTimer(&(timer)->refinementTimer); - #endif - - #ifndef END_MEASURE_REFINEMENT - #define END_MEASURE_REFINEMENT(timer) \ - stopTimer(&(timer)->refinementTimer); - #endif - - - #ifndef OVERALL_START_MEASURE - #define OVERALL_START_MEASURE(timer) \ - gettimeofday(&(timer)->start, NULL); \ - startTimer(&(timer)->overall_timer); (\ - tim)er->partition_timer = 0; - #endif - - #ifndef OVERALL_END_MEASURE - #define OVERALL_END_MEASURE(timer) \ - stopTimer(&(timer)->overall_timer); \ - gettimeofday(&(timer)->end, NULL); - #endif - */ + /* + #ifndef BEGIN_MEASURE_REFINEMENT + #define BEGIN_MEASURE_REFINEMENT(timer) \ + startTimer(&(timer)->refinementTimer); + #endif + + #ifndef END_MEASURE_REFINEMENT + #define END_MEASURE_REFINEMENT(timer) \ + stopTimer(&(timer)->refinementTimer); + #endif + + + #ifndef OVERALL_START_MEASURE + #define OVERALL_START_MEASURE(timer) \ + gettimeofday(&(timer)->start, NULL); \ + startTimer(&(timer)->overall_timer); (\ + tim)er->partition_timer = 0; + #endif + + #ifndef OVERALL_END_MEASURE + #define OVERALL_END_MEASURE(timer) \ + stopTimer(&(timer)->overall_timer); \ + gettimeofday(&(timer)->end, NULL); + #endif + */ }; -} // namespace SESAME +} // namespace SESAME -#endif // SESAME_SRC_TIMER_CYCLEMETER_HPP_ +#endif // SESAME_SRC_TIMER_CYCLEMETER_HPP_ diff --git a/include/Timer/Timer.hpp b/include/Timer/Timer.hpp index 73aca981..75e4b38f 100644 --- a/include/Timer/Timer.hpp +++ b/include/Timer/Timer.hpp @@ -3,22 +3,21 @@ #include -struct Timer -{ - using clock_t = std::chrono::_V2::system_clock::time_point; - clock_t start; - int64_t sum = 0; - void Tick() { start = std::chrono::high_resolution_clock::now(); } - void Tock() - { - auto end = std::chrono::high_resolution_clock::now(); - sum += std::chrono::duration_cast(end - 
start).count(); - } - void Add(clock_t t) - { - auto now = std::chrono::high_resolution_clock::now(); - sum += std::chrono::duration_cast(now - t).count(); - } +struct Timer { + using clock_t = std::chrono::_V2::system_clock::time_point; + clock_t start; + int64_t sum = 0; + void Tick() { start = std::chrono::high_resolution_clock::now(); } + void Tock() { + auto end = std::chrono::high_resolution_clock::now(); + sum += std::chrono::duration_cast(end - start) + .count(); + } + void Add(clock_t t) { + auto now = std::chrono::high_resolution_clock::now(); + sum += + std::chrono::duration_cast(now - t).count(); + } }; #endif diff --git a/include/Timer/rdtsc.hpp b/include/Timer/rdtsc.hpp index 5f19b74d..638bc858 100644 --- a/include/Timer/rdtsc.hpp +++ b/include/Timer/rdtsc.hpp @@ -26,38 +26,36 @@ extern "C" { #endif #if !defined(__i386__) && !defined(__x86_64__) && !defined(__sparc__) -# warning No supported architecture found -- timers will return junk. +#warning No supported architecture found -- timers will return junk. 
#endif -static __inline__ uint64_t curtick() -{ - uint64_t tick; +static __inline__ uint64_t curtick() { + uint64_t tick; #if defined(__i386__) - unsigned long lo, hi; - __asm__ __volatile__(".byte 0x0f, 0x31" : "=a"(lo), "=d"(hi)); - tick = (uint64_t)hi << 32 | lo; + unsigned long lo, hi; + __asm__ __volatile__(".byte 0x0f, 0x31" : "=a"(lo), "=d"(hi)); + tick = (uint64_t)hi << 32 | lo; #elif defined(__x86_64__) - unsigned long lo, hi; - __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); - tick = (uint64_t)hi << 32 | lo; + unsigned long lo, hi; + __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); + tick = (uint64_t)hi << 32 | lo; #elif defined(__sparc__) - __asm__ __volatile__("rd %%tick, %0" : "=r"(tick)); + __asm__ __volatile__("rd %%tick, %0" : "=r"(tick)); #endif - return tick; + return tick; } static __inline__ void startTimer(uint64_t *t) { *t = curtick(); } static __inline__ void stopTimer(uint64_t *t) { *t = curtick() - *t; } -static __inline__ void accTimer(uint64_t *pretimer, uint64_t *acctimer) -{ - *acctimer += curtick() - *pretimer; +static __inline__ void accTimer(uint64_t *pretimer, uint64_t *acctimer) { + *acctimer += curtick() - *pretimer; } -static __inline__ void accTimerMulti(uint64_t *pretimer, uint64_t *acctimer, int multiplier) -{ - *acctimer += (curtick() - *pretimer) * multiplier; +static __inline__ void accTimerMulti(uint64_t *pretimer, uint64_t *acctimer, + int multiplier) { + *acctimer += (curtick() - *pretimer) * multiplier; } #ifdef __cplusplus } diff --git a/include/Utils/BenchmarkUtils.hpp b/include/Utils/BenchmarkUtils.hpp index 95a42fd0..7c2caad3 100644 --- a/include/Utils/BenchmarkUtils.hpp +++ b/include/Utils/BenchmarkUtils.hpp @@ -11,11 +11,10 @@ #include "Algorithm/Param.hpp" #include "Evaluation/Evaluation.hpp" -namespace SESAME -{ +namespace SESAME { std::pair RunBenchmark(param_t ¶m); } -#endif // ONLINEMLBENCHMARK_BENCHMARK_SRC_UTIL_BENCHMARKUTILS_HPP_ +#endif // ONLINEMLBENCHMARK_BENCHMARK_SRC_UTIL_BENCHMARKUTILS_HPP_ 
diff --git a/include/Utils/Logger.hpp b/include/Utils/Logger.hpp index 4ab109ca..e70aca1a 100644 --- a/include/Utils/Logger.hpp +++ b/include/Utils/Logger.hpp @@ -10,14 +10,7 @@ // TRACE < DEBUG < INFO < WARN < ERROR < FATAL #include -enum DebugLevel -{ - LOG_NONE, - LOG_WARNING, - LOG_DEBUG, - LOG_INFO, - LOG_TRACE -}; +enum DebugLevel { LOG_NONE, LOG_WARNING, LOG_DEBUG, LOG_INFO, LOG_TRACE }; #define LEVEL_TRACE 6 #define LEVEL_DEBUG 5 @@ -28,15 +21,15 @@ enum DebugLevel #define SESAME_TRACE(TEXT) std::cerr << TEXT << std::endl; #ifndef NDEBUG -# define SESAME_DEBUG(TEXT) std::cerr << TEXT << std::endl; -# define SESAME_INFO(TEXT) std::cerr << TEXT << std::endl; +#define SESAME_DEBUG(TEXT) std::cerr << TEXT << std::endl; +#define SESAME_INFO(TEXT) std::cerr << TEXT << std::endl; #else -# define SESAME_DEBUG(TEXT) ; -# define SESAME_INFO(TEXT) ; +#define SESAME_DEBUG(TEXT) ; +#define SESAME_INFO(TEXT) ; #endif #define SESAME_TRACE(TEXT) std::cerr << TEXT << std::endl; #define SESAME_WARNING(TEXT) std::cerr << TEXT << std::endl; #define SESAME_ERROR(TEXT) std::cerr << TEXT << std::endl; #define SESAME_FATAL_ERROR(TEXT) std::cerr << TEXT << std::endl; -#endif // SESAME_INCLUDE_UTILS_LOGGER_HPP_ +#endif // SESAME_INCLUDE_UTILS_LOGGER_HPP_ diff --git a/include/Utils/Random.hpp b/include/Utils/Random.hpp index dfc2540e..1bc1bd8f 100644 --- a/include/Utils/Random.hpp +++ b/include/Utils/Random.hpp @@ -3,28 +3,24 @@ #include -class Random -{ +class Random { private: - std::mt19937 r; + std::mt19937 r; public: - Random(int seed = 0) : r(seed) {} - int random_uniform(int min, int max) - { - std::uniform_int_distribution dist(min, max); - return dist(r); - } - double random_uniform(double min, double max) - { - std::uniform_real_distribution dist(min, max); - return dist(r); - } - bool bernoulli(double p) - { - std::bernoulli_distribution dist(p); - return dist(r); - } + Random(int seed = 0) : r(seed) {} + int random_uniform(int min, int max) { + 
std::uniform_int_distribution dist(min, max); + return dist(r); + } + double random_uniform(double min, double max) { + std::uniform_real_distribution dist(min, max); + return dist(r); + } + bool bernoulli(double p) { + std::bernoulli_distribution dist(p); + return dist(r); + } }; #endif \ No newline at end of file diff --git a/include/Utils/SPSCQueue.hpp b/include/Utils/SPSCQueue.hpp index 7b9a6e52..d8972db2 100644 --- a/include/Utils/SPSCQueue.hpp +++ b/include/Utils/SPSCQueue.hpp @@ -1,226 +1,207 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) #pragma once #include #include #include -#include // std::allocator -#include // std::hardware_destructive_interference_size +#include // std::allocator +#include // std::hardware_destructive_interference_size #include -#include // std::enable_if, std::is_*_constructible +#include // std::enable_if, std::is_*_constructible -namespace rigtorp -{ -template > -class SPSCQueue -{ +namespace rigtorp { +template > class SPSCQueue { #if defined(__cpp_if_constexpr) && defined(__cpp_lib_void_t) - template - struct has_allocate_at_least : std::false_type - {}; - - template - struct has_allocate_at_least< - Alloc2, std::void_t().allocate_at_least(size_t{}))>> - : std::true_type - {}; + template + struct has_allocate_at_least : std::false_type {}; + + template + struct has_allocate_at_least< + Alloc2, std::void_t().allocate_at_least( + size_t{}))>> : std::true_type {}; #endif public: - explicit SPSCQueue(const size_t capacity, const Allocator &allocator = Allocator()) - : capacity_(capacity), allocator_(allocator) - { - // The queue needs at least one element - if (capacity_ < 1) - { - capacity_ = 1; - } - capacity_++; // Needs one slack element - // Prevent overflowing size_t - if (capacity_ > SIZE_MAX - 2 * kPadding) - { - capacity_ = SIZE_MAX - 2 * kPadding; - } + explicit SPSCQueue(const size_t capacity, + 
const Allocator &allocator = Allocator()) + : capacity_(capacity), allocator_(allocator) { + // The queue needs at least one element + if (capacity_ < 1) { + capacity_ = 1; + } + capacity_++; // Needs one slack element + // Prevent overflowing size_t + if (capacity_ > SIZE_MAX - 2 * kPadding) { + capacity_ = SIZE_MAX - 2 * kPadding; + } #if defined(__cpp_if_constexpr) && defined(__cpp_lib_void_t) - if constexpr (has_allocate_at_least::value) - { - auto res = allocator_.allocate_at_least(capacity_ + 2 * kPadding); - slots_ = res.ptr; - capacity_ = res.count - 2 * kPadding; - } - else - { - slots_ = - std::allocator_traits::allocate(allocator_, capacity_ + 2 * kPadding); - } + if constexpr (has_allocate_at_least::value) { + auto res = allocator_.allocate_at_least(capacity_ + 2 * kPadding); + slots_ = res.ptr; + capacity_ = res.count - 2 * kPadding; + } else { + slots_ = std::allocator_traits::allocate( + allocator_, capacity_ + 2 * kPadding); + } #else - slots_ = std::allocator_traits::allocate(allocator_, capacity_ + 2 * kPadding); + slots_ = std::allocator_traits::allocate( + allocator_, capacity_ + 2 * kPadding); #endif - static_assert(alignof(SPSCQueue) == kCacheLineSize, ""); - static_assert(sizeof(SPSCQueue) >= 3 * kCacheLineSize, ""); - assert(reinterpret_cast(&readIdx_) - reinterpret_cast(&writeIdx_) >= - static_cast(kCacheLineSize)); - } + static_assert(alignof(SPSCQueue) == kCacheLineSize, ""); + static_assert(sizeof(SPSCQueue) >= 3 * kCacheLineSize, ""); + assert(reinterpret_cast(&readIdx_) - + reinterpret_cast(&writeIdx_) >= + static_cast(kCacheLineSize)); + } - ~SPSCQueue() - { - while (front()) - { - pop(); - } - std::allocator_traits::deallocate(allocator_, slots_, capacity_ + 2 * kPadding); + ~SPSCQueue() { + while (front()) { + pop(); } - - // non-copyable and non-movable - SPSCQueue(const SPSCQueue &) = delete; - - SPSCQueue &operator=(const SPSCQueue &) = delete; - - template - void emplace(Args &&...args) 
noexcept(std::is_nothrow_constructible::value) - { - static_assert(std::is_constructible::value, - "T must be constructible with Args&&..."); - auto const writeIdx = writeIdx_.load(std::memory_order_relaxed); - auto nextWriteIdx = writeIdx + 1; - if (nextWriteIdx == capacity_) - { - nextWriteIdx = 0; - } - while (nextWriteIdx == readIdxCache_) - { - readIdxCache_ = readIdx_.load(std::memory_order_acquire); - } - new (&slots_[writeIdx + kPadding]) T(std::forward(args)...); - writeIdx_.store(nextWriteIdx, std::memory_order_release); + std::allocator_traits::deallocate(allocator_, slots_, + capacity_ + 2 * kPadding); + } + + // non-copyable and non-movable + SPSCQueue(const SPSCQueue &) = delete; + + SPSCQueue &operator=(const SPSCQueue &) = delete; + + template + void emplace(Args &&...args) noexcept( + std::is_nothrow_constructible::value) { + static_assert(std::is_constructible::value, + "T must be constructible with Args&&..."); + auto const writeIdx = writeIdx_.load(std::memory_order_relaxed); + auto nextWriteIdx = writeIdx + 1; + if (nextWriteIdx == capacity_) { + nextWriteIdx = 0; } - - template - bool try_emplace(Args &&...args) noexcept(std::is_nothrow_constructible::value) - { - static_assert(std::is_constructible::value, - "T must be constructible with Args&&..."); - auto const writeIdx = writeIdx_.load(std::memory_order_relaxed); - auto nextWriteIdx = writeIdx + 1; - if (nextWriteIdx == capacity_) - { - nextWriteIdx = 0; - } - if (nextWriteIdx == readIdxCache_) - { - readIdxCache_ = readIdx_.load(std::memory_order_acquire); - if (nextWriteIdx == readIdxCache_) - { - return false; - } - } - new (&slots_[writeIdx + kPadding]) T(std::forward(args)...); - writeIdx_.store(nextWriteIdx, std::memory_order_release); - return true; + while (nextWriteIdx == readIdxCache_) { + readIdxCache_ = readIdx_.load(std::memory_order_acquire); } - - void push(const T &v) noexcept(std::is_nothrow_copy_constructible::value) - { - static_assert(std::is_copy_constructible::value, 
"T must be copy constructible"); - emplace(v); + new (&slots_[writeIdx + kPadding]) T(std::forward(args)...); + writeIdx_.store(nextWriteIdx, std::memory_order_release); + } + + template + bool try_emplace(Args &&...args) noexcept( + std::is_nothrow_constructible::value) { + static_assert(std::is_constructible::value, + "T must be constructible with Args&&..."); + auto const writeIdx = writeIdx_.load(std::memory_order_relaxed); + auto nextWriteIdx = writeIdx + 1; + if (nextWriteIdx == capacity_) { + nextWriteIdx = 0; } - - template ::value>::type> - void push(P &&v) noexcept(std::is_nothrow_constructible::value) - { - emplace(std::forward

(v)); + if (nextWriteIdx == readIdxCache_) { + readIdxCache_ = readIdx_.load(std::memory_order_acquire); + if (nextWriteIdx == readIdxCache_) { + return false; + } } - - bool try_push(const T &v) noexcept(std::is_nothrow_copy_constructible::value) - { - static_assert(std::is_copy_constructible::value, "T must be copy constructible"); - return try_emplace(v); - } - - template ::value>::type> - bool try_push(P &&v) noexcept(std::is_nothrow_constructible::value) - { - return try_emplace(std::forward

(v)); + new (&slots_[writeIdx + kPadding]) T(std::forward(args)...); + writeIdx_.store(nextWriteIdx, std::memory_order_release); + return true; + } + + void push(const T &v) noexcept(std::is_nothrow_copy_constructible::value) { + static_assert(std::is_copy_constructible::value, + "T must be copy constructible"); + emplace(v); + } + + template ::value>::type> + void push(P &&v) noexcept(std::is_nothrow_constructible::value) { + emplace(std::forward

(v)); + } + + bool + try_push(const T &v) noexcept(std::is_nothrow_copy_constructible::value) { + static_assert(std::is_copy_constructible::value, + "T must be copy constructible"); + return try_emplace(v); + } + + template ::value>::type> + bool try_push(P &&v) noexcept(std::is_nothrow_constructible::value) { + return try_emplace(std::forward

(v)); + } + + T *front() noexcept { + auto const readIdx = readIdx_.load(std::memory_order_relaxed); + if (readIdx == writeIdxCache_) { + writeIdxCache_ = writeIdx_.load(std::memory_order_acquire); + if (writeIdxCache_ == readIdx) { + return nullptr; + } } - - T *front() noexcept - { - auto const readIdx = readIdx_.load(std::memory_order_relaxed); - if (readIdx == writeIdxCache_) - { - writeIdxCache_ = writeIdx_.load(std::memory_order_acquire); - if (writeIdxCache_ == readIdx) - { - return nullptr; - } - } - return &slots_[readIdx + kPadding]; + return &slots_[readIdx + kPadding]; + } + + void pop() noexcept { + static_assert(std::is_nothrow_destructible::value, + "T must be nothrow destructible"); + auto const readIdx = readIdx_.load(std::memory_order_relaxed); + assert(writeIdx_.load(std::memory_order_acquire) != readIdx); + slots_[readIdx + kPadding].~T(); + auto nextReadIdx = readIdx + 1; + if (nextReadIdx == capacity_) { + nextReadIdx = 0; } - - void pop() noexcept - { - static_assert(std::is_nothrow_destructible::value, "T must be nothrow destructible"); - auto const readIdx = readIdx_.load(std::memory_order_relaxed); - assert(writeIdx_.load(std::memory_order_acquire) != readIdx); - slots_[readIdx + kPadding].~T(); - auto nextReadIdx = readIdx + 1; - if (nextReadIdx == capacity_) - { - nextReadIdx = 0; - } - readIdx_.store(nextReadIdx, std::memory_order_release); - } - - size_t size() const noexcept - { - std::ptrdiff_t diff = - writeIdx_.load(std::memory_order_acquire) - readIdx_.load(std::memory_order_acquire); - if (diff < 0) - { - diff += capacity_; - } - return static_cast(diff); + readIdx_.store(nextReadIdx, std::memory_order_release); + } + + size_t size() const noexcept { + std::ptrdiff_t diff = writeIdx_.load(std::memory_order_acquire) - + readIdx_.load(std::memory_order_acquire); + if (diff < 0) { + diff += capacity_; } + return static_cast(diff); + } - bool empty() const noexcept { return size() == 0; } + bool empty() const noexcept { return size() 
== 0; } - size_t capacity() const noexcept { return capacity_ - 1; } + size_t capacity() const noexcept { return capacity_ - 1; } private: #ifdef __cpp_lib_hardware_interference_size - static constexpr size_t kCacheLineSize = std::hardware_destructive_interference_size; + static constexpr size_t kCacheLineSize = + std::hardware_destructive_interference_size; #else - static constexpr size_t kCacheLineSize = 64; + static constexpr size_t kCacheLineSize = 64; #endif - // Padding to avoid false sharing between slots_ and adjacent allocations - static constexpr size_t kPadding = (kCacheLineSize - 1) / sizeof(T) + 1; + // Padding to avoid false sharing between slots_ and adjacent allocations + static constexpr size_t kPadding = (kCacheLineSize - 1) / sizeof(T) + 1; private: - size_t capacity_; - T *slots_; + size_t capacity_; + T *slots_; #if defined(__has_cpp_attribute) && __has_cpp_attribute(no_unique_address) - Allocator allocator_ [[no_unique_address]]; + Allocator allocator_ [[no_unique_address]]; #else - Allocator allocator_; + Allocator allocator_; #endif - // Align to cache line size in order to avoid false sharing - // readIdxCache_ and writeIdxCache_ is used to reduce the amount of cache - // coherency traffic - alignas(kCacheLineSize) std::atomic writeIdx_ = {0}; - alignas(kCacheLineSize) size_t readIdxCache_ = 0; - alignas(kCacheLineSize) std::atomic readIdx_ = {0}; - alignas(kCacheLineSize) size_t writeIdxCache_ = 0; - - // Padding to avoid adjacent allocations to share cache line with - // writeIdxCache_ - char padding_[kCacheLineSize - sizeof(writeIdxCache_)]; + // Align to cache line size in order to avoid false sharing + // readIdxCache_ and writeIdxCache_ is used to reduce the amount of cache + // coherency traffic + alignas(kCacheLineSize) std::atomic writeIdx_ = {0}; + alignas(kCacheLineSize) size_t readIdxCache_ = 0; + alignas(kCacheLineSize) std::atomic readIdx_ = {0}; + alignas(kCacheLineSize) size_t writeIdxCache_ = 0; + + // Padding to avoid 
adjacent allocations to share cache line with + // writeIdxCache_ + char padding_[kCacheLineSize - sizeof(writeIdxCache_)]; }; -} // namespace rigtorp \ No newline at end of file +} // namespace rigtorp \ No newline at end of file diff --git a/include/Utils/Types.hpp b/include/Utils/Types.hpp new file mode 100644 index 00000000..861c6405 --- /dev/null +++ b/include/Utils/Types.hpp @@ -0,0 +1,26 @@ +#ifndef SESAME_INCLUDE_UTILS_TYPES_HPP_ +#define SESAME_INCLUDE_UTILS_TYPES_HPP_ + +#include +#include + +namespace SESAME { +using int8 = std::int8_t; +using int16 = std::int16_t; +using int32 = std::int32_t; +using int64 = std::int64_t; + +using uint8 = std::uint8_t; +using uint16 = std::uint16_t; +using uint32 = std::uint32_t; +using uint64 = std::uint64_t; + +using fp32 = float; +using fp64 = double; + +using clock_t = std::chrono::system_clock::time_point; +using feature_t = fp64; + +} // namespace SESAME + +#endif \ No newline at end of file diff --git a/include/Utils/UtilityFunctions.hpp b/include/Utils/UtilityFunctions.hpp index 2a7e8aa8..005b3a29 100644 --- a/include/Utils/UtilityFunctions.hpp +++ b/include/Utils/UtilityFunctions.hpp @@ -17,53 +17,51 @@ #include using uint64 = unsigned long long; -using int64 = long long; +using int64 = long long; /* Period parameters */ -const int N = 624; -const int M = 397; -const uint64 MATRIX_A = 0x9908b0dfUL; /* constant vector a */ -const uint64 UPPER_MASK = 0x80000000UL; /* most significant w-r bits */ -const uint64 LOWER_MASK = 0x7fffffffUL; /* least significant r bits */ -const int TRUE = 1; -const int FALSE = 0; -const int DEFAULT_WEIGHT = 1; -const int DEFAULT_COST = 0; +const int N = 624; +const int M = 397; +const uint64 MATRIX_A = 0x9908b0dfUL; /* constant vector a */ +const uint64 UPPER_MASK = 0x80000000UL; /* most significant w-r bits */ +const uint64 LOWER_MASK = 0x7fffffffUL; /* least significant r bits */ +const int TRUE = 1; +const int FALSE = 0; +const int DEFAULT_WEIGHT = 1; +const int DEFAULT_COST = 
0; const int DEFAULT_QUEUE_CAPACITY = 3000; -const int KMEANS_TIMES = 5; -const int CMM_KNN = 10; -const double CMM_A = 0.998; -const int CMM_LAMDA = 1; -const int CMM_THRESHOLD = 542; -const int UNCLASSIFIED = (-2); -const int CORE_POINT = 1; -const int NOISE = (-1); -const int SUCCESS = 0; -const int FAILURE = (-3); +const int KMEANS_TIMES = 5; +const int CMM_KNN = 10; +const double CMM_A = 0.998; +const int CMM_LAMDA = 1; +const int CMM_THRESHOLD = 542; +const int UNCLASSIFIED = (-2); +const int CORE_POINT = 1; +const int NOISE = (-1); +const int SUCCESS = 0; +const int FAILURE = (-3); /* Determines when Lloyd terminates (should be between 0 and 1) */ const double THRESHOLD = 1.000; -namespace SESAME -{ +namespace SESAME { typedef std::shared_ptr> BarrierPtr; -class UtilityFunctions -{ +class UtilityFunctions { public: - UtilityFunctions(); - static void init_genrand(unsigned long s); - static double genrand_real3(); - static long genrand_int31(void); - static unsigned long genrand_int32(void); - static std::shared_ptr> createBarrier(int count); - static void groupByCenters(const std::vector &input, - const std::vector ¢ers, std::vector &output, - int dim); - static void groupByCentersWithOffline(const std::vector &input, - const std::vector ¢ers, - std::vector &output, int dim); + UtilityFunctions(); + static void init_genrand(unsigned long s); + static double genrand_real3(); + static long genrand_int31(void); + static unsigned long genrand_int32(void); + static std::shared_ptr> createBarrier(int count); + static void groupByCenters(const std::vector &input, + const std::vector ¢ers, + std::vector &output, int dim); + static void groupByCentersWithOffline(const std::vector &input, + const std::vector ¢ers, + std::vector &output, int dim); }; std::string getExecutablePath(); @@ -71,5 +69,5 @@ std::string getExecutableDir(); std::string mergePaths(std::string pathA, std::string pathB); bool checkIfFileExists(const std::string &filePath); -} // namespace SESAME 
-#endif // SESAME_SRC_UTILS_UTILITYFUNCTIONS_HPP_ +} // namespace SESAME +#endif // SESAME_SRC_UTILS_UTILITYFUNCTIONS_HPP_ diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..9a76f229 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,20 @@ +[build-system] +requires = ["setuptools>=61.0", "pybind11>=2.12.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "pysame" +version = "0.1.0" +authors = [{ name = "Zhengru Wang", email = "wangshaun@outlook.com" }] +description = "Stream clustering algorithms on modern hardware" +readme = "README.md" +requires-python = ">=3.7" +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", +] + +[project.urls] +Homepage = "https://github.com/intellistream/Sesame" +Issues = "https://github.com/intellistream/Sesame/issues" diff --git a/python/README.md b/python/README.md index 527a18d2..2848bb9b 100644 --- a/python/README.md +++ b/python/README.md @@ -1,3 +1,22 @@ # Python -Python API binding \ No newline at end of file +Python API binding for sesame + +## How to build pypi wheel + +```bash +docker run -it --rm -v $(pwd):/Sesame quay.io/pypa/manylinux_2_28_x86_64 /bin/bash + +# run in docker bash + +cd Sesame +for v in 3.7 3.8 3.9 3.10 3.11 3.12 3.13 +do +python$v -m pip install build +python$v -m build +done + +auditwheel repair dist/pysame-*-linux_x86_64.whl +python3.9 -m pip install twine +python3.9 -m twine upload --repository testpypi wheelhouse/* +``` \ No newline at end of file diff --git a/python/demo.py b/python/demo.py index 817b8404..b4e4db7c 100644 --- a/python/demo.py +++ b/python/demo.py @@ -1,18 +1,15 @@ -import benne +#!python3 -# Create an instance of Parameters -params = benne.Param() +from pysame import Benne, Birch +# from sklearn.cluster import Birch -# Get and set the algorithm -algorithm = params.algo +X = [[0, 1], [0.3, 1], [-0.3, 1], [0, -1], [0.3, -1], [-0.3, -1]] -print("Current 
algorithm:", algorithm) -params.algo = benne.AlgoType.BIRCH -params.input_file = "/home/shaun/Sesame/datasets/CoverType.txt" +brc = Birch( + n_clusters=2, + dim=2, + threshold=0.5, +) -# ... Continue getting and setting other parameters -# Accessing docstring -print(benne.Param.algo.__doc__) - -# Run the SESAME algorithm -benne.run() +print(brc.partial_fit(X).predict(X)) +# print(brc) \ No newline at end of file diff --git a/python/src/Pysame.cpp b/python/src/Pysame.cpp new file mode 100644 index 00000000..305a7584 --- /dev/null +++ b/python/src/Pysame.cpp @@ -0,0 +1,37 @@ +#include "Python/Benne.hpp" +#include "Python/Birch.hpp" + +namespace py = pybind11; +using namespace py::literals; +using namespace SESAME; +using namespace std; + +PYBIND11_MODULE(pysame, m) { + py::class_(m, "Benne") + .def(py::init(), + py::arg("dim") = 2, py::arg("n_clusters") = 3, + py::arg("threshold") = 0.5, py::arg("queue_size_threshold") = 100, + py::arg("dim_threshold") = 2, py::arg("variance_threshold") = 0.5, + py::arg("outliers_num_threshold") = 10, + py::arg("outliers_dist_threshold") = 0.5) + .def("fit", &Benne::fit, py::return_value_policy::reference_internal) + .def("fit_predict", &Benne::fit_predict) + .def("get_params", &Benne::get_params) + .def("partial_fit", &Benne::partial_fit, + py::return_value_policy::reference_internal) + .def("predict", &Benne::predict) + .def("set_output", &Benne::set_output) + .def("set_params", &Benne::set_params); + py::class_(m, "Birch") + .def(py::init(), py::arg("dim") = 2, + py::arg("n_clusters") = 3, py::arg("threshold") = 0.5, + py::arg("branching_factor") = 50) + .def("fit", &Birch::fit, py::return_value_policy::reference_internal) + .def("fit_predict", &Birch::fit_predict) + .def("get_params", &Birch::get_params) + .def("partial_fit", &Birch::partial_fit, + py::return_value_policy::reference_internal) + .def("predict", &Birch::predict) + .def("set_output", &Birch::set_output) + .def("set_params", &Birch::set_params); +} \ No newline at 
end of file diff --git a/python/src/bindings.cpp b/python/src/bindings.cpp deleted file mode 100644 index 5a98f248..00000000 --- a/python/src/bindings.cpp +++ /dev/null @@ -1,436 +0,0 @@ -// Copyright (C) 2021 by the IntelliStream team -// (https://github.com/intellistream) - -#include -#include -#include -#include "Algorithm/AlgorithmFactory.hpp" -#include "Algorithm/DataStructure/GenericFactory.hpp" -#include "Sinks/DataSinkFactory.hpp" -#include "Sources/DataSourceFactory.hpp" -#include "Utils/BenchmarkUtils.hpp" -#include "const.h" - -using namespace std::filesystem; -using namespace std; -using namespace SESAME; - -namespace py = pybind11; - -// Define a global variable to hold the parameters -param_t &GetGlobalParam() { - static param_t param; - return param; -} - -// Wrapper class for param_t -class Param { -public: - - Param() { - GetGlobalParam().algo = BirchType; - GetGlobalParam().num_points = 542; - GetGlobalParam().dim = 54; - GetGlobalParam().num_clusters = 2; - GetGlobalParam().max_in_nodes = 3; // Maximum number of internal nodes - GetGlobalParam().max_leaf_nodes = 3; // Maximum number of leaf nodes - GetGlobalParam().distance_threshold = 3550.0; // Distance threshold - GetGlobalParam().seed = 1; // Seed for random number generator - GetGlobalParam().coreset_size = 100; // Coreset size - GetGlobalParam().radius = 0.1; // Radius - GetGlobalParam().delta = 10.0; // Delta - GetGlobalParam().beta = 0.0021; // Beta - GetGlobalParam().buf_size = 500; // Buffer size - GetGlobalParam().alpha = 0.998; // Alpha - GetGlobalParam().lambda = 1.0; // Lambda - GetGlobalParam().clean_interval = 2500; // Clean interval - GetGlobalParam().min_weight = 0.5; // Minimum weight - GetGlobalParam().base = 2; // Base - GetGlobalParam().cm = 5.0; // Cm - GetGlobalParam().cl = 0.8; // Cl - GetGlobalParam().grid_width = 12.0; // Grid width - GetGlobalParam().min_points = 10; // Minimum points - GetGlobalParam().epsilon = 50.0; // Epsilon - GetGlobalParam().mu = 7.0; // Mu - 
GetGlobalParam().num_last_arr = 60; // Number of last arrive - GetGlobalParam().time_window = 6; // Time window - GetGlobalParam().num_online_clusters = 10; // Number of online clusters - GetGlobalParam().delta_grid = 0.2; // The delta parameter used in the grid for guessing the optimum - GetGlobalParam().num_samples = 100; // Number of samples - GetGlobalParam().landmark = 1000; // Landmark - GetGlobalParam().sliding = 1000; // Sliding - GetGlobalParam().outlier_distance_threshold = 1000; // Outlier distance threshold - GetGlobalParam().outlier_cap = 100; // Outlier cap - GetGlobalParam().outlier_density_threshold = 100; // Outlier density threshold - GetGlobalParam().neighbor_distance = 200; // Neighbor distance - GetGlobalParam().k = 0; // KMeans K - GetGlobalParam().arr_rate = 0; // Arrival rate - GetGlobalParam().run_offline = true; // Whether to run offline clustering - GetGlobalParam().run_eval = true; // Whether to run evaluation - GetGlobalParam().run_cmm = true; // Whether to run CMM evaluation - GetGlobalParam().run_pur = true; // Whether to run Purity evaluation - GetGlobalParam().time_interval = 100; - GetGlobalParam().offline_time_window = 0; - GetGlobalParam().opt = 2; - GetGlobalParam().input_file = "datasets/CoverType.txt"; - GetGlobalParam().output_file = "results.txt"; - if (GetGlobalParam().algo == BirchType) { - GetGlobalParam().outlier_cap = std::numeric_limits::min(); - } - } - - - // AlgoType - AlgoType algo() const { return GetGlobalParam().algo; } - - void set_algo(AlgoType value) { GetGlobalParam().algo = value; } - - // Dataset - std::string input_file() const { return GetGlobalParam().input_file; } - - void set_input_file(const std::string &value) { GetGlobalParam().input_file = value; } - - int num_points() const { return GetGlobalParam().num_points; } - - void set_num_points(int value) { GetGlobalParam().num_points = value; } - - int dim() const { return GetGlobalParam().dim; } - - void set_dim(int value) { GetGlobalParam().dim = 
value; } - - int num_clusters() const { return GetGlobalParam().num_clusters; } - - void set_num_clusters(int value) { GetGlobalParam().num_clusters = value; } - - // BIRCH - int max_in_nodes() const { return GetGlobalParam().max_in_nodes; } - - void set_max_in_nodes(int value) { GetGlobalParam().max_in_nodes = value; } - - int max_leaf_nodes() const { return GetGlobalParam().max_leaf_nodes; } - - void set_max_leaf_nodes(int value) { GetGlobalParam().max_leaf_nodes = value; } - - double distance_threshold() const { return GetGlobalParam().distance_threshold; } - - void set_distance_threshold(double value) { GetGlobalParam().distance_threshold = value; } - - // StreamKM++ - int seed() const { return GetGlobalParam().seed; } - - void set_seed(int value) { GetGlobalParam().seed = value; } - - int coreset_size() const { return GetGlobalParam().coreset_size; } - - void set_coreset_size(int value) { GetGlobalParam().coreset_size = value; } - - // EDMStream - double radius() const { return GetGlobalParam().radius; } - - void set_radius(double value) { GetGlobalParam().radius = value; } - - double delta() const { return GetGlobalParam().delta; } - - void set_delta(double value) { GetGlobalParam().delta = value; } - - double beta() const { return GetGlobalParam().beta; } - - void set_beta(double value) { GetGlobalParam().beta = value; } - - int buf_size() const { return GetGlobalParam().buf_size; } - - void set_buf_size(int value) { GetGlobalParam().buf_size = value; } - - double alpha() const { return GetGlobalParam().alpha; } - - void set_alpha(double value) { GetGlobalParam().alpha = value; } - - double lambda() const { return GetGlobalParam().lambda; } - - void set_lambda(double value) { GetGlobalParam().lambda = value; } - - // DBStream - int clean_interval() const { return GetGlobalParam().clean_interval; } - - void set_clean_interval(int value) { GetGlobalParam().clean_interval = value; } - - double min_weight() const { return GetGlobalParam().min_weight; } - - void 
set_min_weight(double value) { GetGlobalParam().min_weight = value; } - - double base() const { return GetGlobalParam().base; } - - void set_base(double value) { GetGlobalParam().base = value; } - - // DStream - double cm() const { return GetGlobalParam().cm; } - - void set_cm(double value) { GetGlobalParam().cm = value; } - - double cl() const { return GetGlobalParam().cl; } - - void set_cl(double value) { GetGlobalParam().cl = value; } - - double grid_width() const { return GetGlobalParam().grid_width; } - - void set_grid_width(double value) { GetGlobalParam().grid_width = value; } - - // DenStream - unsigned int min_points() const { return GetGlobalParam().min_points; } - - void set_min_points(unsigned int value) { GetGlobalParam().min_points = value; } - - double epsilon() const { return GetGlobalParam().epsilon; } - - void set_epsilon(double value) { GetGlobalParam().epsilon = value; } - - double mu() const { return GetGlobalParam().mu; } - - void set_mu(double value) { GetGlobalParam().mu = value; } - - // Clustream - int num_last_arr() const { return GetGlobalParam().num_last_arr; } - - void set_num_last_arr(int value) { GetGlobalParam().num_last_arr = value; } - - int time_window() const { return GetGlobalParam().time_window; } - - void set_time_window(int value) { GetGlobalParam().time_window = value; } - - int num_online_clusters() const { return GetGlobalParam().num_online_clusters; } - - void set_num_online_clusters(int value) { GetGlobalParam().num_online_clusters = value; } - - // SL-KMeans - double delta_grid() const { return GetGlobalParam().delta_grid; } - - void set_delta_grid(double value) { GetGlobalParam().delta_grid = value; } - - int num_samples() const { return GetGlobalParam().num_samples; } - - void set_num_samples(int value) { GetGlobalParam().num_samples = value; } - - // Generic - int landmark() const { return GetGlobalParam().landmark; } - - void set_landmark(int value) { GetGlobalParam().landmark = value; } - - int sliding() const { 
return GetGlobalParam().sliding; } - - void set_sliding(int value) { GetGlobalParam().sliding = value; } - - double outlier_distance_threshold() const { return GetGlobalParam().outlier_distance_threshold; } - - void set_outlier_distance_threshold(double value) { GetGlobalParam().outlier_distance_threshold = value; } - - int outlier_cap() const { return GetGlobalParam().outlier_cap; } - - void set_outlier_cap(int value) { GetGlobalParam().outlier_cap = value; } - - double outlier_density_threshold() const { return GetGlobalParam().outlier_density_threshold; } - - void set_outlier_density_threshold(double value) { GetGlobalParam().outlier_density_threshold = value; } - - double neighbor_distance() const { return GetGlobalParam().neighbor_distance; } - - void set_neighbor_distance(double value) { GetGlobalParam().neighbor_distance = value; } - - int k() const { return GetGlobalParam().k; } - - void set_k(int value) { GetGlobalParam().k = value; } - - int arr_rate() const { return GetGlobalParam().arr_rate; } - - void set_arr_rate(int value) { GetGlobalParam().arr_rate = value; } - - bool run_offline() const { return GetGlobalParam().run_offline; } - - void set_run_offline(bool value) { GetGlobalParam().run_offline = value; } - - bool run_eval() const { return GetGlobalParam().run_eval; } - - void set_run_eval(bool value) { GetGlobalParam().run_eval = value; } - - bool run_cmm() const { return GetGlobalParam().run_cmm; } - - void set_run_cmm(bool value) { GetGlobalParam().run_cmm = value; } - - bool run_pur() const { return GetGlobalParam().run_pur; } - - void set_run_pur(bool value) { GetGlobalParam().run_pur = value; } - - void Print() { GetGlobalParam().Print(); } - - // Add other properties for the member variables of param_t - - // num_res - int num_res() const { return GetGlobalParam().num_res; } - - void set_num_res(int value) { GetGlobalParam().num_res = value; } - - // Define docstrings for each parameter - static constexpr const char *algo_doc = "Algorithm to 
use"; - static constexpr const char *input_file_doc = "Input file path"; - static constexpr const char *num_points_doc = "Number of points"; - static constexpr const char *dim_doc = "Dimension of points"; - static constexpr const char *num_clusters_doc = "Number of clusters"; - // BIRCH - static constexpr const char *max_in_nodes_doc = "Maximum number of internal nodes"; - static constexpr const char *max_leaf_nodes_doc = "Maximum number of leaf nodes"; - static constexpr const char *distance_threshold_doc = "Distance threshold"; - // StreamKM++ - static constexpr const char *seed_doc = "Seed for random number generator"; - static constexpr const char *coreset_size_doc = "Coreset size"; - // EDMStream - static constexpr const char *radius_doc = "Radius"; - static constexpr const char *delta_doc = "Delta"; - static constexpr const char *beta_doc = "Beta"; - static constexpr const char *buf_size_doc = "Buffer size"; - static constexpr const char *alpha_doc = "Alpha"; - static constexpr const char *lambda_doc = "Lambda"; - // DBStream - static constexpr const char *clean_interval_doc = "Clean interval"; - static constexpr const char *min_weight_doc = "Minimum weight"; - static constexpr const char *base_doc = "Base"; - // DStream - static constexpr const char *cm_doc = "Cm"; - static constexpr const char *cl_doc = "Cl"; - static constexpr const char *grid_width_doc = "Grid width"; - // DenStream - static constexpr const char *min_points_doc = "Minimum points"; - static constexpr const char *epsilon_doc = "Epsilon"; - static constexpr const char *mu_doc = "Mu"; - // Clustream - static constexpr const char *num_last_arr_doc = "Number of last arrive"; - static constexpr const char *time_window_doc = "Time window"; - static constexpr const char *num_online_clusters_doc = "Number of online clusters"; - // SL-KMeans - static constexpr const char *delta_grid_doc = "The delta parameter used in the grid for guessing the optimum."; - static constexpr const char *num_samples_doc 
= "Number of samples"; - // Generic - static constexpr const char *landmark_doc = "Landmark"; - static constexpr const char *sliding_doc = "Sliding"; - static constexpr const char *outlier_distance_threshold_doc = "Outlier distance threshold"; - static constexpr const char *outlier_cap_doc = "Outlier cap"; - static constexpr const char *outlier_density_threshold_doc = "Outlier density threshold"; - static constexpr const char *neighbor_distance_doc = "Neighbor distance"; - static constexpr const char *k_doc = "KMeans K"; - static constexpr const char *arr_rate_doc = "Arrival rate"; - static constexpr const char *run_offline_doc = "Whether to run offline clustering"; - static constexpr const char *run_eval_doc = "Whether to run evaluation"; - static constexpr const char *run_cmm_doc = "Whether to run CMM evaluation"; - static constexpr const char *run_pur_doc = "Whether to run Purity evaluation"; - -private: - param_t globalParam; -}; - -py::tuple run() { - warning(); - // Access the parameters using the global `param` object - param_t ¶m = GetGlobalParam(); - // Run algorithm producing results. 
- auto res = SESAME::RunBenchmark(param); - // return py::make_tuple(res.first, res.second); - return py::make_tuple(); -} - -// Initialize the module -PYBIND11_MODULE(benne, m) { - m.doc() = "Module documentation string"; - - py::class_(m, - "Param") - .def(py::init<>()) - .def_property("algo", &Param::algo, &Param::set_algo, Param::algo_doc) - .def_property("input_file", &Param::input_file, &Param::set_input_file, - Param::input_file_doc) - .def_property("num_points", &Param::num_points, &Param::set_num_points, - Param::num_points_doc) - .def_property("dim", &Param::dim, &Param::set_dim, Param::dim_doc) - .def_property("num_clusters", &Param::num_clusters, &Param::set_num_clusters, - Param::num_clusters_doc) - // BIRCH - .def_property("max_in_nodes", &Param::max_in_nodes, &Param::set_max_in_nodes, - Param::max_in_nodes_doc) - .def_property("max_leaf_nodes", &Param::max_leaf_nodes, &Param::set_max_leaf_nodes, - Param::max_leaf_nodes_doc) - .def_property("distance_threshold", &Param::distance_threshold, &Param::set_distance_threshold, - Param::distance_threshold_doc) - // StreamKM++ - .def_property("seed", &Param::seed, &Param::set_seed, Param::seed_doc) - .def_property("coreset_size", &Param::coreset_size, &Param::set_coreset_size, - Param::coreset_size_doc) - // EDMStream - .def_property("radius", &Param::radius, &Param::set_radius, Param::radius_doc) - .def_property("delta", &Param::delta, &Param::set_delta, Param::delta_doc) - .def_property("beta", &Param::beta, &Param::set_beta, Param::beta_doc) - .def_property("buf_size", &Param::buf_size, &Param::set_buf_size, Param::buf_size_doc) - .def_property("alpha", &Param::alpha, &Param::set_alpha, Param::alpha_doc) - .def_property("lambda", &Param::lambda, &Param::set_lambda, Param::lambda_doc) - // DBStream - .def_property("clean_interval", &Param::clean_interval, &Param::set_clean_interval, - Param::clean_interval_doc) - .def_property("min_weight", &Param::min_weight, &Param::set_min_weight, - 
Param::min_weight_doc) - .def_property("base", &Param::base, &Param::set_base, Param::base_doc) - // DStream - .def_property("cm", &Param::cm, &Param::set_cm, Param::cm_doc) - .def_property("cl", &Param::cl, &Param::set_cl, Param::cl_doc) - .def_property("grid_width", &Param::grid_width, &Param::set_grid_width, - Param::grid_width_doc) - // DenStream - .def_property("min_points", &Param::min_points, &Param::set_min_points, - Param::min_points_doc) - .def_property("epsilon", &Param::epsilon, &Param::set_epsilon, Param::epsilon_doc) - .def_property("mu", &Param::mu, &Param::set_mu, Param::mu_doc) - // Clustream - .def_property("num_last_arr", &Param::num_last_arr, &Param::set_num_last_arr, - Param::num_last_arr_doc) - .def_property("time_window", &Param::time_window, &Param::set_time_window, - Param::time_window_doc) - .def_property("num_online_clusters", &Param::num_online_clusters, &Param::set_num_online_clusters, - Param::num_online_clusters_doc) - // SL-KMeans - .def_property("delta_grid", &Param::delta_grid, &Param::set_delta_grid, - Param::delta_grid_doc) - .def_property("num_samples", &Param::num_samples, &Param::set_num_samples, - Param::num_samples_doc) - // Generic - .def_property("landmark", &Param::landmark, &Param::set_landmark, Param::landmark_doc) - .def_property("sliding", &Param::sliding, &Param::set_sliding, Param::sliding_doc) - .def_property("outlier_distance_threshold", &Param::outlier_distance_threshold, - &Param::set_outlier_distance_threshold, Param::outlier_distance_threshold_doc) - .def_property("outlier_cap", &Param::outlier_cap, &Param::set_outlier_cap, - Param::outlier_cap_doc) - .def_property("outlier_density_threshold", &Param::outlier_density_threshold, - &Param::set_outlier_density_threshold, Param::outlier_density_threshold_doc) - .def_property("neighbor_distance", &Param::neighbor_distance, &Param::set_neighbor_distance, - Param::neighbor_distance_doc) - .def_property("k", &Param::k, &Param::set_k, Param::k_doc) - 
.def_property("arr_rate", &Param::arr_rate, &Param::set_arr_rate, Param::arr_rate_doc) - .def_property("run_offline", &Param::run_offline, &Param::set_run_offline, - Param::run_offline_doc) - .def_property("run_eval", &Param::run_eval, &Param::set_run_eval, Param::run_eval_doc) - .def_property("run_cmm", &Param::run_cmm, &Param::set_run_cmm, Param::run_cmm_doc) - .def_property("run_pur", &Param::run_pur, &Param::set_run_pur, Param::run_pur_doc); - - py::enum_(m, "AlgoType") - .value("BIRCH", BirchType) - .value("STREAMKM", StreamKMeansType) - .value("CLUSTREAM", CluStreamType) - .value("DENSTREAM", DenStreamType) - .value("DBSTREAM", DBStreamType) - .value("EDMSTREAM", EDMStreamType) - .value("DSTREAM", DStreamType) - .value("SLKMEANS", SLKMeansType); - // Expose the algo_names array as a Python list - py::list algoNames; - for (int i = 0; i < sizeof(algo_names) / sizeof(algo_names[0]); ++i) { - algoNames.append(algo_names[i]); - } - m.attr("algo_names") = algoNames; - - m.def("run", &run, py::return_value_policy::reference); -} - diff --git a/python/src/const.h b/python/src/const.h deleted file mode 100644 index cfe74010..00000000 --- a/python/src/const.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright 2023 IntelliStream team (https://github.com/intellistream) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef SESAME_CONST_H -#define SESAME_CONST_H - -#include "Utils/BenchmarkUtils.hpp" -#include "Sources/DataSourceFactory.hpp" -#include "Sinks/DataSinkFactory.hpp" -#include "Algorithm/DataStructure/GenericFactory.hpp" -#include "Algorithm/AlgorithmFactory.hpp" -#include -#include -#include -#include - -void warning(); - -void warning() { -#ifndef NDEBUG - std::cerr << "\033[1;31m#####################################################" - "#######\n" - << "# #\n" - << "# DON'T run in debug mode. #\n" - << "# #\n" - << "############################################################" - "\033[0m\n"; - sleep(2); -#endif - -} - -#endif //SESAME_CONST_H diff --git a/scripts/README.md b/scripts/README.md index afa1d11f..93564380 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -71,11 +71,3 @@ beta = 0.2, epsilon = 16, mu = 10, lambda = 0.25, base = 2 (fixed), buf_sizeSize **G9:** 同DStream, 新加landmark范围: 1000 / 2000 / 4000 / 8000 / 16000 / 32000 / 50000 / 100000, cm范围全部修改成[20, 10, 100], cl范围修改成[2,2,10] - - -#### Potential Problem - -1. G1的k在每个数据集上直接设置成10000 -2. G9的purity出现0和Null -3. G8在KDD,sensor和Insects上的purity能不能调的更高一点?要都比birch高 -4. 
G4为啥FCT上0 diff --git a/setup.py b/setup.py index 595b46a3..d25c52f3 100644 --- a/setup.py +++ b/setup.py @@ -1,142 +1,145 @@ import os -import distutils.ccompiler -import distutils.sysconfig -import setuptools -import shlex +import re import subprocess -from setuptools import setup, Extension -from setuptools.command.build_ext import build_ext -from distutils.errors import CompileError, LinkError import sys +from pathlib import Path -__version__ = '0.0.1' - -# list of all sources files -sources = [ - "src/Timer/TimeMeter.cpp", - "src/Utils/BenchmarkUtils.cpp", - "src/Utils/UtilityFunctions.cpp", - "src/Evaluation/Euclidean.cpp", - "src/Evaluation/NMI.cpp", - "src/Evaluation/CMM.cpp", - "src/Evaluation/Purity.cpp", - "src/Evaluation/Evaluation.cpp", - "src/Sinks/DataSink.cpp", - "src/Sources/DataSource.cpp", - "src/Engine/Engine.cpp", - "src/Engine/SimpleEngine.cpp", - "src/Engine/SingleThread.cpp", - "src/APIs/APIs.cpp", - "src/Algorithm/DesignAspect/V9.cpp", - "src/Algorithm/DesignAspect/V10.cpp", - "src/Algorithm/DesignAspect/V16.cpp", - "src/Algorithm/Benne.cpp", - "src/Algorithm/WindowModel/LandmarkWindow.cpp", - "src/Algorithm/WindowModel/DampedWindow.cpp", - "src/Algorithm/WindowModel/WindowModel.cpp", - "src/Algorithm/WindowModel/WindowFactory.cpp", - "src/Algorithm/OfflineRefinement/OfflineRefinement.cpp", - "src/Algorithm/OfflineRefinement/KMeans.cpp", - "src/Algorithm/OfflineRefinement/DBSCAN.cpp", - "src/Algorithm/OfflineRefinement/ConnectedRegions.cpp", - "src/Algorithm/DataStructure/Point.cpp", - "src/Algorithm/DataStructure/TreeNode.cpp", - "src/Algorithm/DataStructure/CoresetTree.cpp", - "src/Algorithm/DataStructure/MicroCluster.cpp", - "src/Algorithm/DataStructure/Snapshot.cpp", - "src/Algorithm/DataStructure/WeightedAdjacencyList.cpp", - "src/Algorithm/DataStructure/DataStructureFactory.cpp", - "src/Algorithm/DataStructure/CFTree.cpp", - "src/Algorithm/DataStructure/DPTree.cpp", - "src/Algorithm/DataStructure/DPNode.cpp", - 
"src/Algorithm/DataStructure/Cache.cpp", - "src/Algorithm/DataStructure/OutlierResevoir.cpp", - "src/Algorithm/DataStructure/FeatureVector.cpp", - "src/Algorithm/DataStructure/CharacteristicsVector.cpp", - "src/Algorithm/DataStructure/DensityGrid.cpp", - "src/Algorithm/DataStructure/GridCluster.cpp", - "src/Algorithm/DataStructure/MeyersonSketch.cpp", - "src/Algorithm/StreamKM.cpp", - "src/Algorithm/CluStream.cpp", - "src/Algorithm/Birch.cpp", - "src/Algorithm/DenStream.cpp", - "src/Algorithm/EDMStream.cpp", - "src/Algorithm/DBStream.cpp", - "src/Algorithm/DStream.cpp", - "src/Algorithm/SlidingWindowClustering.cpp", - "src/Algorithm/Algorithm.cpp", - "src/Algorithm/AlgorithmFactory.cpp", - "benchmark/src/Benchmark.cpp", - "python/src/bindings.cpp" -] - -# list of all include directories -include_dirs = [ - "src", - "include" -] - -os.environ["CC"] = "g++" -os.environ["CXX"] = "g++" - -extension = Extension( - "benne", - sources=sources, - include_dirs=include_dirs, - library_dirs=['/usr/lib/x86_64-linux-gnu/'], - language="c++", - extra_compile_args=[ - "-std=c++20", - "-Wall", - "-Werror=return-type", - "-fconcepts-diagnostics-depth=2", - "-fopenmp", - "-march=native", - "-Wno-ignored-qualifiers", - "-Wno-sign-compare", - "-O3", - "-DNDEBUG", - "-fPIC", - "-rdynamic" - ], - extra_link_args=['-fopenmp'], # explicitly link libomp - libraries=['gflags', 'gomp', '/usr/lib/x86_64-linux-gnu/libgomp.so.1'], # Use 'gomp' and 'omp' -) +from setuptools import Extension, setup +from setuptools.command.build_ext import build_ext +from pybind11.setup_helpers import Pybind11Extension + +__version__ = "0.1.0" + +# Convert distutils Windows platform specifiers to CMake -A arguments +PLAT_TO_CMAKE = { + "win32": "Win32", + "win-amd64": "x64", + "win-arm32": "ARM", + "win-arm64": "ARM64", +} + +# get cpu cores +os.environ["CMAKE_BUILD_PARALLEL_LEVEL"] = str(os.cpu_count() // 2 + 1) + + +# A CMakeExtension needs a sourcedir instead of a file list. 
+# The name must be the _single_ output extension from the CMake build. +# If you need multiple extensions, see scikit-build. +class CMakeExtension(Extension): + def __init__(self, name: str, sourcedir: str = "") -> None: + super().__init__(name, sources=[]) + self.sourcedir = os.fspath(Path(sourcedir).resolve()) + + +class CMakeBuild(build_ext): + def build_extension(self, ext: CMakeExtension) -> None: + # Must be in this form due to bug in .resolve() only fixed in Python 3.10+ + ext_fullpath = Path.cwd() / self.get_ext_fullpath(ext.name) + extdir = ext_fullpath.parent.resolve() + + # Using this requires trailing slash for auto-detection & inclusion of + # auxiliary "native" libs + + debug = int(os.environ.get("DEBUG", 0)) if self.debug is None else self.debug + cfg = "Debug" if debug else "Release" + + # CMake lets you override the generator - we need to check this. + # Can be set with Conda-Build, for example. + cmake_generator = os.environ.get("CMAKE_GENERATOR", "") + + # Set Python_EXECUTABLE instead if you use PYBIND11_FINDPYTHON + # SESAME_VERSION_INFO shows you how to pass a value into the C++ code + # from Python. + cmake_args = [ + f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}{os.sep}", + f"-DPYTHON_EXECUTABLE={sys.executable}", + f"-DCMAKE_BUILD_TYPE={cfg}", # not used on MSVC, but no harm + f"-DENABLE_PYTHON=ON", + ] + build_args = [] + # Adding CMake arguments set as environment variable + # (needed e.g. to build for ARM OSx on conda-forge) + if "CMAKE_ARGS" in os.environ: + cmake_args += [item for item in os.environ["CMAKE_ARGS"].split(" ") if item] + + # In this example, we pass in the version to C++. You might not need to. + cmake_args += [f"-DSESAME_VERSION_INFO={self.distribution.get_version()}"] + + if self.compiler.compiler_type != "msvc": + # Using Ninja-build since it a) is available as a wheel and b) + # multithreads automatically. MSVC would require all variables be + # exported for Ninja to pick it up, which is a little tricky to do. 
+ # Users can override the generator with CMAKE_GENERATOR in CMake + # 3.15+. + if not cmake_generator or cmake_generator == "Ninja": + try: + import ninja + + ninja_executable_path = Path(ninja.BIN_DIR) / "ninja" + cmake_args += [ + "-GNinja", + f"-DCMAKE_MAKE_PROGRAM:FILEPATH={ninja_executable_path}", + ] + except ImportError: + pass + + else: + # Single config generators are handled "normally" + single_config = any(x in cmake_generator for x in {"NMake", "Ninja"}) + + # CMake allows an arch-in-generator style for backward compatibility + contains_arch = any(x in cmake_generator for x in {"ARM", "Win64"}) + + # Specify the arch if using MSVC generator, but only if it doesn't + # contain a backward-compatibility arch spec already in the + # generator name. + if not single_config and not contains_arch: + cmake_args += ["-A", PLAT_TO_CMAKE[self.plat_name]] + + # Multi-config generators have a different way to specify configs + if not single_config: + cmake_args += [ + f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{cfg.upper()}={extdir}" + ] + build_args += ["--config", cfg] + + if sys.platform.startswith("darwin"): + # Cross-compile support for macOS - respect ARCHFLAGS if set + archs = re.findall(r"-arch (\S+)", os.environ.get("ARCHFLAGS", "")) + if archs: + cmake_args += ["-DCMAKE_OSX_ARCHITECTURES={}".format(";".join(archs))] + + # Set CMAKE_BUILD_PARALLEL_LEVEL to control the parallel build level + # across all generators. + if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ: + # self.parallel is a Python 3 only way to set parallel jobs by hand + # using -j in the build_ext call, not supported by pip or PyPA-build. + if hasattr(self, "parallel") and self.parallel: + # CMake 3.12+ only. 
+ build_args += [f"-j{self.parallel}"] + + build_temp = Path(self.build_temp) / ext.name + if not build_temp.exists(): + build_temp.mkdir(parents=True) + + subprocess.run( + ["cmake", ext.sourcedir, *cmake_args], cwd=build_temp, check=True + ) + subprocess.run( + ["cmake", "--build", ".", *build_args], cwd=build_temp, check=True + ) -class CustomBuildExtCommand(build_ext): - """A custom build extension for adding compiler-specific options.""" - c_opts = { - 'msvc': ['/EHsc'], - 'unix': [], - } - - if sys.platform == 'darwin': - c_opts['unix'] += ['-stdlib=libc++', '-mmacosx-version-min=10.7', '-lomp'] - - def build_extensions(self): - ct = self.compiler.compiler_type - opts = self.c_opts.get(ct, []) - if ct == 'unix': - opts.append('-DVERSION_INFO="%s"' % self.distribution.get_version()) - opts.append('-fopenmp') - opts.append('-std=c++20') - opts.append('-march=native') - # get output of python3 -m pybind11 --includes - # and split it into a list of include directories - opts.extend(shlex.split( - subprocess.getoutput('python3 -m pybind11 --includes'))) - for ext in self.extensions: - ext.extra_compile_args = opts - build_ext.build_extensions(self) setup( - name='benne', + name="pysame", version=__version__, - author='Shuhao', - url='https://github.com/intellistream/Sesame', - description='Benne using pybind11', - long_description='This project aims at building a scalable stream mining library on modern hardware. The repo contains currently several representative real-world stream clustering algorithms and several synthetic algorithms.', - ext_modules=[extension], - cmdclass={'build_ext': CustomBuildExtCommand}, + author="Shuhao Zhang", + url="https://github.com/intellistream/Sesame", + description="Stream clustering algorithms on modern hardware", + long_description="This project aims at building a scalable stream mining library on modern hardware. 
The repo contains currently several representative real-world stream clustering algorithms and several synthetic algorithms.", + ext_modules=[CMakeExtension("pysame")], + cmdclass={"build_ext": CMakeBuild}, zip_safe=False, -) \ No newline at end of file + python_requires=">=3.7", + include_package_data=False, +) diff --git a/src/APIs/APIs.cpp b/src/APIs/APIs.cpp deleted file mode 100644 index de38a8d7..00000000 --- a/src/APIs/APIs.cpp +++ /dev/null @@ -1,6 +0,0 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) - -// -// Created by Shuhao Zhang on 19/07/2021. -// -#include diff --git a/src/APIs/CMakeLists.txt b/src/APIs/CMakeLists.txt deleted file mode 100644 index 284c657c..00000000 --- a/src/APIs/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# https://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -add_source_sesame( - APIs.cpp -) -#add_subdirectory(Expressions) \ No newline at end of file diff --git a/src/Algorithm/Algorithm.cpp b/src/Algorithm/Algorithm.cpp index 59ded1ea..a700de79 100644 --- a/src/Algorithm/Algorithm.cpp +++ b/src/Algorithm/Algorithm.cpp @@ -7,66 +7,64 @@ #include "Algorithm/Algorithm.hpp" -namespace SESAME -{ -char const *algo_names[64] = {[BirchType] = "Birch", +namespace SESAME { +char const *algo_names[64] = {[BirchType] = "Birch", [StreamKMeansType] = "StreamKMeans", - [CluStreamType] = "CluStream", - [DenStreamType] = "DenStream", - [DBStreamType] = "DBStream", - [EDMStreamType] = "EDMStream", - [DStreamType] = "DStream", - [SLKMeansType] = "SLKMeans", - [BenneType] = "Benne", - [9] = "ERROR9", - [10] = "ERROR10", - [11] = "ERROR11", - [12] = "ERROR12", - [13] = "ERROR13", - [14] = "ERROR14", - [15] = "ERROR15", - [16] = "ERROR16", - [17] = "ERROR17", - [18] = "ERROR18", - [19] = "ERROR19", - [20] = "ERROR20", - [G1Stream] = "G1", - [G2Stream] = "G2", - [G3Stream] = "G3", - [G4Stream] = "G4", - [G5Stream] = "G5", - [G6Stream] = "G6", - [G7Stream] = "G7", - [G8Stream] = "G8", - [G9Stream] = "G9", - [G10Stream] = "G10", - [G11Stream] = "G11", - [G12Stream] = "G12", - [G13Stream] = "G13", - [G14Stream] = "G14", - [G15Stream] = "G15", - [G16Stream] = "G16"}; + [CluStreamType] = "CluStream", + [DenStreamType] = "DenStream", + [DBStreamType] = "DBStream", + [EDMStreamType] = "EDMStream", + [DStreamType] = "DStream", + [SLKMeansType] = "SLKMeans", + [BenneType] = "Benne", + [9] = "ERROR9", + [10] = "ERROR10", + [11] = "ERROR11", + [12] = "ERROR12", + [13] = "ERROR13", + [14] = "ERROR14", + [15] = "ERROR15", + [16] = "ERROR16", + [17] = "ERROR17", + [18] = "ERROR18", + [19] = "ERROR19", + [20] = "ERROR20", + [G1Stream] = "G1", + [G2Stream] = "G2", + [G3Stream] = "G3", + [G4Stream] = "G4", + [G5Stream] = "G5", + [G6Stream] = "G6", + [G7Stream] = "G7", + [G8Stream] = "G8", + [G9Stream] = "G9", + [G10Stream] = "G10", + [G11Stream] = "G11", + 
[G12Stream] = "G12", + [G13Stream] = "G13", + [G14Stream] = "G14", + [G15Stream] = "G15", + [G16Stream] = "G16"}; -char const *benne_suffix[4] = {[balance] = "Bal", - [accuracy] = "Acc", +char const *benne_suffix[4] = {[balance] = "Bal", + [accuracy] = "Acc", [efficiency] = "Eff", [accuracy_no_migration] = "AccNoMig"}; -void Algorithm::Store(std::string output_file, int dim, std::vector result) -{ - int numberOfCenters = (int)result.size(); - FILE *out = fopen(output_file.c_str(), "w"); - for (int i = 0; i < numberOfCenters; i++) - { - int l; - fprintf(out, "%f ", result[i]->getWeight()); - for (l = 0; l < dim; l++) - { - fprintf(out, "%f ", result[i]->getFeatureItem(l) / result[i]->getWeight()); - } - fprintf(out, "\n"); +void Algorithm::Store(std::string output_file, int dim, + std::vector result) { + int numberOfCenters = (int)result.size(); + FILE *out = fopen(output_file.c_str(), "w"); + for (int i = 0; i < numberOfCenters; i++) { + int l; + fprintf(out, "%f ", result[i]->getWeight()); + for (l = 0; l < dim; l++) { + fprintf(out, "%f ", + result[i]->getFeatureItem(l) / result[i]->getWeight()); } - fclose(out); + fprintf(out, "\n"); + } + fclose(out); } -} // namespace SESAME +} // namespace SESAME diff --git a/src/Algorithm/AlgorithmFactory.cpp b/src/Algorithm/AlgorithmFactory.cpp index b61a78db..309c9d24 100644 --- a/src/Algorithm/AlgorithmFactory.cpp +++ b/src/Algorithm/AlgorithmFactory.cpp @@ -7,6 +7,7 @@ #include "Algorithm/AlgorithmFactory.hpp" +#include "Algorithm/Benne.hpp" #include "Algorithm/Birch.hpp" #include "Algorithm/CluStream.hpp" #include "Algorithm/DBStream.hpp" @@ -16,150 +17,131 @@ #include "Algorithm/DenStream.hpp" #include "Algorithm/DesignAspect/Generic.hpp" #include "Algorithm/DesignAspect/V10.hpp" -#include "Algorithm/DesignAspect/V9.hpp" #include "Algorithm/DesignAspect/V16.hpp" +#include "Algorithm/DesignAspect/V9.hpp" #include "Algorithm/EDMStream.hpp" #include "Algorithm/OutlierDetection/OutlierDetection.hpp" #include 
"Algorithm/SlidingWindowClustering.hpp" #include "Algorithm/StreamKM.hpp" -#include "Algorithm/Benne.hpp" -namespace SESAME -{ -AlgorithmPtr AlgorithmFactory::create(param_t &cmd_params) -{ - switch (cmd_params.algo) - { - case (BenneType): - { - return std::make_shared(cmd_params); - } - case (StreamKMeansType): - { - return std::make_shared(cmd_params); - } - case (CluStreamType): - { - return std::make_shared(cmd_params); - } - case (DenStreamType): - { - return std::make_shared(cmd_params); - } - case (EDMStreamType): - { - return std::make_shared(cmd_params); - } - case (DBStreamType): - { - return std::make_shared(cmd_params); - } - case (DStreamType): - { - return std::make_shared(cmd_params); - } - case (SLKMeansType): - { - return std::make_shared(cmd_params); - } - case (G9Stream): - { - return std::make_shared(cmd_params); - } - case (G1Stream): - { - return std::make_shared, KMeans>>( - cmd_params); - } - case (G2Stream): - { - return std::make_shared, DBSCAN>>( - cmd_params); - } - case (G3Stream): - { - return std::make_shared, NoRefinement>>( - cmd_params); - } - case (G4Stream): - { - return std::make_shared, NoRefinement>>( - cmd_params); - } - case (G5Stream): - { - return std::make_shared, NoRefinement>>( - cmd_params); - } - case (BirchType): - { - return std::make_shared< - StreamClustering>( - cmd_params); - } - case (G6Stream): - { - return std::make_shared< - StreamClustering>( - cmd_params); - } - case (G7Stream): - { - return std::make_shared, NoRefinement>>( - cmd_params); - } - case (G8Stream): - { - return std::make_shared, NoRefinement>>( - cmd_params); - } - case (G10Stream): - { - return std::make_shared(cmd_params); - } - case (G11Stream): - { - return std::make_shared, NoRefinement>>( - cmd_params); - } - case (G12Stream): - { - return std::make_shared, NoRefinement>>( - cmd_params); - } - case (G13Stream): - { - return std::make_shared, NoRefinement>>( - cmd_params); - } - case (G14Stream): - { - return std::make_shared, 
NoRefinement>>( - cmd_params); - } - case (G15Stream): - { - return std::make_shared< - StreamClustering, NoRefinement>>( - cmd_params); - } - case (G16Stream): - { - return std::make_shared(cmd_params); - } - default: throw std::invalid_argument("Unsupported algorithm"); - } +namespace SESAME { +AlgorithmPtr AlgorithmFactory::create(param_t &cmd_params) { + switch (cmd_params.algo) { + case (BenneType): { + return std::make_shared(cmd_params); + } + case (StreamKMeansType): { + return std::make_shared(cmd_params); + } + case (CluStreamType): { + return std::make_shared(cmd_params); + } + case (DenStreamType): { + return std::make_shared(cmd_params); + } + case (EDMStreamType): { + return std::make_shared(cmd_params); + } + case (DBStreamType): { + return std::make_shared(cmd_params); + } + case (DStreamType): { + return std::make_shared(cmd_params); + } + case (SLKMeansType): { + return std::make_shared(cmd_params); + } + case (G9Stream): { + return std::make_shared(cmd_params); + } + case (G1Stream): { + return std::make_shared< + StreamClustering, KMeans>>(cmd_params); + } + case (G2Stream): { + return std::make_shared< + StreamClustering, DBSCAN>>(cmd_params); + } + case (G3Stream): { + return std::make_shared< + StreamClustering, NoRefinement>>( + cmd_params); + } + case (G4Stream): { + return std::make_shared< + StreamClustering, NoRefinement>>( + cmd_params); + } + case (G5Stream): { + return std::make_shared< + StreamClustering, NoRefinement>>( + cmd_params); + } + case (BirchType): { + return std::make_shared>( + cmd_params); + } + case (G6Stream): { + return std::make_shared>( + cmd_params); + } + case (G7Stream): { + return std::make_shared< + StreamClustering, NoRefinement>>( + cmd_params); + } + case (G8Stream): { + return std::make_shared< + StreamClustering, NoRefinement>>( + cmd_params); + } + case (G10Stream): { + return std::make_shared(cmd_params); + } + case (G11Stream): { + return std::make_shared< + StreamClustering, NoRefinement>>( + 
cmd_params); + } + case (G12Stream): { + return std::make_shared< + StreamClustering, NoRefinement>>( + cmd_params); + } + case (G13Stream): { + return std::make_shared< + StreamClustering, NoRefinement>>( + cmd_params); + } + case (G14Stream): { + return std::make_shared< + StreamClustering, NoRefinement>>( + cmd_params); + } + case (G15Stream): { + return std::make_shared, NoRefinement>>( + cmd_params); + } + case (G16Stream): { + return std::make_shared(cmd_params); + } + default: + throw std::invalid_argument("Unsupported algorithm"); + } } -} // namespace SESAME +} // namespace SESAME diff --git a/src/Algorithm/Benne.cpp b/src/Algorithm/Benne.cpp index 377459b6..3bc7b5c6 100644 --- a/src/Algorithm/Benne.cpp +++ b/src/Algorithm/Benne.cpp @@ -14,360 +14,319 @@ using namespace SESAME; using namespace std; -double calculateDispersion(const vector &queue_, SESAME::PointPtr newCenter); +double calculateDispersion(const vector &queue_, + SESAME::PointPtr newCenter); -Benne::Benne(param_t &cmd_params) : kmeans(cmd_params) -{ - param = cmd_params, T = param.benne_threshold, obj = param.obj; +Benne::Benne(param_t &cmd_params) : kmeans(cmd_params) { + param = cmd_params, T = param.benne_threshold, obj = param.obj; } Benne::~Benne() {} -void Benne::Init() -{ - sum_timer.Tick(); - if (obj == accuracy || obj == accuracy_no_migration) - { - windowSel = damped; - dataSel = CFT; - outlierSel = ODBT; - refineSel = Incre; - algo = make_shared, NoRefinement>>(param); - } - else if (obj == efficiency) - { - windowSel = sliding; - dataSel = Grids; - outlierSel = NoOD; - refineSel = NoRefine; - algo = make_shared(param); // problem - } - else - { - windowSel = landmark; - dataSel = CoreT; - outlierSel = ODT; - refineSel = OneShot; - algo = make_shared< - StreamClustering, KMeans>>(param); - } - first_algo = windowSel << 12 | dataSel << 8 | outlierSel << 4 | refineSel; - change_log.push_back(make_pair(0, first_algo)); - algo->Init(); +void Benne::Init() { + sum_timer.Tick(); + if 
(obj == accuracy || obj == accuracy_no_migration) { + windowSel = damped; + dataSel = CFT; + outlierSel = ODBT; + refineSel = Incre; + algo = make_shared< + StreamClustering, NoRefinement>>(param); + } else if (obj == efficiency) { + windowSel = sliding; + dataSel = Grids; + outlierSel = NoOD; + refineSel = NoRefine; + algo = make_shared(param); // problem + } else { + windowSel = landmark; + dataSel = CoreT; + outlierSel = ODT; + refineSel = OneShot; + algo = make_shared, KMeans>>( + param); + } + first_algo = windowSel << 12 | dataSel << 8 | outlierSel << 4 | refineSel; + change_log.push_back(make_pair(0, first_algo)); + algo->Init(); } -void Benne::RunOnline(const PointPtr input) -{ - algo->RunOnline(input); - if (queue_.size() < T.queue_size) - { - queue_.push_back(input); - } - else - { - det_timer.Tick(); - auto old_algo = windowSel << 12 | dataSel << 8 | outlierSel << 4 | refineSel; - Train(input); - auto new_algo = Infer(input); - det_timer.Tock(); - if (old_algo != new_algo) - { - change_count++; - change_log.push_back(make_pair(input->index, new_algo)); - cerr << "benne algo changes from " << hex << old_algo << " to " << new_algo << " when #" - << dec << input->index << endl; - } - mig_timer.Tick(); - UpdateAlgo(old_algo, new_algo); - vector emptyQueue; - queue_.swap(emptyQueue); - mig_timer.Tock(); +void Benne::RunOnline(const PointPtr input) { + algo->RunOnline(input); + if (queue_.size() < T.queue_size) { + queue_.push_back(input); + } else { + det_timer.Tick(); + auto old_algo = + windowSel << 12 | dataSel << 8 | outlierSel << 4 | refineSel; + Train(input); + auto new_algo = Infer(input); + det_timer.Tock(); + if (old_algo != new_algo) { + change_count++; + change_log.push_back(make_pair(input->index, new_algo)); + cerr << "benne algo changes from " << hex << old_algo << " to " + << new_algo << " when #" << dec << input->index << endl; } - if (refineSel == Incre && (input->index > 0 && input->index % INCRE_REF_CNT == 0)) - { - ref_timer.Tick(); - 
vector temp_centers, new_centers; - algo->OutputOnline(temp_centers); - if (temp_centers.size()) cerr << "temp_centers size: " << temp_centers.size() << endl; - kmeans.Run(param, temp_centers, new_centers); - algo->Init(); - for (auto ¢er : new_centers) - { - algo->Insert(center); - } - ref_timer.Tock(); + mig_timer.Tick(); + UpdateAlgo(old_algo, new_algo); + vector emptyQueue; + queue_.swap(emptyQueue); + mig_timer.Tock(); + } + if (refineSel == Incre && + (input->index > 0 && input->index % INCRE_REF_CNT == 0)) { + ref_timer.Tick(); + vector temp_centers, new_centers; + algo->OutputOnline(temp_centers); + if (temp_centers.size()) + cerr << "temp_centers size: " << temp_centers.size() << endl; + kmeans.Run(param, temp_centers, new_centers); + algo->Init(); + for (auto ¢er : new_centers) { + algo->Insert(center); } - lat_timer.Add(input->toa); + ref_timer.Tock(); + } + lat_timer.Add(input->toa); } -void Benne::RunOffline(DataSinkPtr sinkPtr) -{ - on_timer.Add(sum_timer.start); - cout << "change_count: " << change_count << endl; - cout << "change_log: '"; - for (auto &p : change_log) - { - cout << hex << p.second << dec << "@" << p.first << ";"; - } - cout << "'" << endl; - cout << "first_algo: " << hex << first_algo << endl; - cout << "final_algo: " << (windowSel << 12 | dataSel << 8 | outlierSel << 4 | refineSel) << dec - << endl; - cout << "mig_us: " << mig_timer.sum / 1000 << endl; - cout << "det_us: " << det_timer.sum / 1000 << endl; - // assert(centers.size() <= 50000); - for (auto ¢er : materialized_centers) sinkPtr->put(center); - // for (auto ¢er : centers) sinkPtr->put(center); - algo->RunOffline(sinkPtr); - win_timer.sum += algo->win_timer.sum; - ds_timer.sum += algo->ds_timer.sum; - out_timer.sum += algo->out_timer.sum; - ref_timer.sum += algo->ref_timer.sum; - sum_timer.Tock(); +void Benne::RunOffline(DataSinkPtr sinkPtr) { + on_timer.Add(sum_timer.start); + cout << "change_count: " << change_count << endl; + cout << "change_log: '"; + for (auto &p : 
change_log) { + cout << hex << p.second << dec << "@" << p.first << ";"; + } + cout << "'" << endl; + cout << "first_algo: " << hex << first_algo << endl; + cout << "final_algo: " + << (windowSel << 12 | dataSel << 8 | outlierSel << 4 | refineSel) << dec + << endl; + cout << "mig_us: " << mig_timer.sum / 1000 << endl; + cout << "det_us: " << det_timer.sum / 1000 << endl; + // assert(centers.size() <= 50000); + for (auto ¢er : materialized_centers) + sinkPtr->put(center); + // for (auto ¢er : centers) sinkPtr->put(center); + algo->RunOffline(sinkPtr); + win_timer.sum += algo->win_timer.sum; + ds_timer.sum += algo->ds_timer.sum; + out_timer.sum += algo->out_timer.sum; + ref_timer.sum += algo->ref_timer.sum; + sum_timer.Tock(); } -void Benne::Train(const PointPtr &input) -{ - int highDimData = 0; - int outlierNumber = 0; - double var = 0; - PointPtr newCenter = make_shared(input->dim); - vector temp_centers; - algo->OutputOnline(temp_centers); - for (auto &frontElement : queue_) - { - // Obtain the center of the queue - for (int i = 0; i < frontElement->dim; i++) - { - newCenter->feature[i] += frontElement->feature[i] / queue_.size(); - } - if (frontElement->dim > 30) - { - highDimData++; - } - double minDist = DBL_MAX; - for (auto ¢er : temp_centers) - { - double dist = frontElement->L2Dist(center); - if (dist != 0.0 && dist < minDist) - { - minDist = dist; - } - } - if (minDist > T.outliers_dist && minDist < DBL_MAX) - { - outlierNumber++; - } - } - var = calculateDispersion(queue_, newCenter); - if (highDimData > queue_.size() * 0.5) - { - chara.highDimension = true; - } - else - { - chara.highDimension = false; +void Benne::Train(const PointPtr &input) { + int highDimData = 0; + int outlierNumber = 0; + double var = 0; + PointPtr newCenter = make_shared(input->dim); + vector temp_centers; + algo->OutputOnline(temp_centers); + for (auto &frontElement : queue_) { + // Obtain the center of the queue + for (int i = 0; i < frontElement->dim; i++) { + 
newCenter->feature[i] += frontElement->feature[i] / queue_.size(); } - if (var > T.variance) - { - chara.frequentDrift = true; + if (frontElement->dim > 30) { + highDimData++; } - else - { - chara.frequentDrift = false; + double minDist = DBL_MAX; + for (auto ¢er : temp_centers) { + double dist = frontElement->L2Dist(center); + if (dist != 0.0 && dist < minDist) { + minDist = dist; + } } - if (outlierNumber > queue_.size() * 0.5) - { - chara.manyOutliers = true; - } - else - { - chara.manyOutliers = false; + if (minDist > T.outliers_dist && minDist < DBL_MAX) { + outlierNumber++; } + } + var = calculateDispersion(queue_, newCenter); + if (highDimData > queue_.size() * 0.5) { + chara.highDimension = true; + } else { + chara.highDimension = false; + } + if (var > T.variance) { + chara.frequentDrift = true; + } else { + chara.frequentDrift = false; + } + if (outlierNumber > queue_.size() * 0.5) { + chara.manyOutliers = true; + } else { + chara.manyOutliers = false; + } } -int Benne::Infer(const PointPtr &input) -{ - if (obj == accuracy || obj == accuracy_no_migration) - { - if (chara.frequentDrift) - { - dataSel != MCs ? ds_changed = true : ds_changed = false; - dataSel = MCs; - } - else - { - dataSel != CFT ? ds_changed = true : ds_changed = false; - dataSel = CFT; - } - if (chara.manyOutliers) - { - (windowSel != landmark || outlierSel != ODBT) ? ds_changed = true : ds_changed = false; - windowSel = landmark; - outlierSel = ODBT; - } - else - { - windowSel != damped ? ds_changed = true : ds_changed = false; - windowSel = damped; - if (chara.highDimension) - { - outlierSel != ODB ? ds_changed = true : ds_changed = false; - outlierSel = ODB; - } - else - { - outlierSel != ODBT ? ds_changed = true : ds_changed = false; - outlierSel = ODBT; - } - } - refineSel != Incre ? 
ds_changed = true : ds_changed = false; - refineSel = Incre; +int Benne::Infer(const PointPtr &input) { + if (obj == accuracy || obj == accuracy_no_migration) { + if (chara.frequentDrift) { + dataSel != MCs ? ds_changed = true : ds_changed = false; + dataSel = MCs; + } else { + dataSel != CFT ? ds_changed = true : ds_changed = false; + dataSel = CFT; + } + if (chara.manyOutliers) { + (windowSel != landmark || outlierSel != ODBT) ? ds_changed = true + : ds_changed = false; + windowSel = landmark; + outlierSel = ODBT; + } else { + windowSel != damped ? ds_changed = true : ds_changed = false; + windowSel = damped; + if (chara.highDimension) { + outlierSel != ODB ? ds_changed = true : ds_changed = false; + outlierSel = ODB; + } else { + outlierSel != ODBT ? ds_changed = true : ds_changed = false; + outlierSel = ODBT; + } } - else if (obj == efficiency) - { - if (chara.frequentDrift) - { - (dataSel != DPT || windowSel != landmark) ? ds_changed = true : ds_changed = false; - dataSel = DPT; - windowSel = landmark; - } - else - { - (dataSel != Grids || windowSel != sliding) ? ds_changed = true : ds_changed = false; - dataSel = Grids; - windowSel = sliding; - } - (outlierSel != NoOD || refineSel != NoRefine) ? ds_changed = true : ds_changed = false; - outlierSel = NoOD; - refineSel = NoRefine; + refineSel != Incre ? ds_changed = true : ds_changed = false; + refineSel = Incre; + } else if (obj == efficiency) { + if (chara.frequentDrift) { + (dataSel != DPT || windowSel != landmark) ? ds_changed = true + : ds_changed = false; + dataSel = DPT; + windowSel = landmark; + } else { + (dataSel != Grids || windowSel != sliding) ? ds_changed = true + : ds_changed = false; + dataSel = Grids; + windowSel = sliding; } - else - { - if (chara.frequentDrift) - { - dataSel != AMS ? ds_changed = true : ds_changed = false; - dataSel = AMS; - } - else - { - dataSel != CoreT ? ds_changed = true : ds_changed = false; - dataSel = CoreT; - } - if (chara.highDimension) - { - outlierSel != OD ? 
ds_changed = true : ds_changed = false; - outlierSel = OD; - } - else - { - outlierSel != ODT ? ds_changed = true : ds_changed = false; - outlierSel = ODT; - } - (windowSel != landmark || refineSel != OneShot) ? ds_changed = true : ds_changed = false; - windowSel = landmark; - refineSel = OneShot; + (outlierSel != NoOD || refineSel != NoRefine) ? ds_changed = true + : ds_changed = false; + outlierSel = NoOD; + refineSel = NoRefine; + } else { + if (chara.frequentDrift) { + dataSel != AMS ? ds_changed = true : ds_changed = false; + dataSel = AMS; + } else { + dataSel != CoreT ? ds_changed = true : ds_changed = false; + dataSel = CoreT; } + if (chara.highDimension) { + outlierSel != OD ? ds_changed = true : ds_changed = false; + outlierSel = OD; + } else { + outlierSel != ODT ? ds_changed = true : ds_changed = false; + outlierSel = ODT; + } + (windowSel != landmark || refineSel != OneShot) ? ds_changed = true + : ds_changed = false; + windowSel = landmark; + refineSel = OneShot; + } - int combine = windowSel << 12 | dataSel << 8 | outlierSel << 4 | refineSel; - return combine; + int combine = windowSel << 12 | dataSel << 8 | outlierSel << 4 | refineSel; + return combine; } -void Benne::UpdateAlgo(int old_algo, int new_algo) -{ - if (old_algo == new_algo) return; - vector temp_centers; - algo->OutputOnline(temp_centers); - win_timer.sum += algo->win_timer.sum; - ds_timer.sum += algo->ds_timer.sum; - out_timer.sum += algo->out_timer.sum; - ref_timer.sum += algo->ref_timer.sum; - switch (new_algo) - { - case 0x0040: - algo = make_shared< - StreamClustering, NoRefinement>>( - param); - break; - case 0x2020: - algo = make_shared< - StreamClustering, NoRefinement>>( +void Benne::UpdateAlgo(int old_algo, int new_algo) { + if (old_algo == new_algo) + return; + vector temp_centers; + algo->OutputOnline(temp_centers); + win_timer.sum += algo->win_timer.sum; + ds_timer.sum += algo->ds_timer.sum; + out_timer.sum += algo->out_timer.sum; + ref_timer.sum += algo->ref_timer.sum; + 
switch (new_algo) { + case 0x0040: + algo = make_shared, NoRefinement>>( + param); + break; + case 0x2020: + algo = make_shared, NoRefinement>>( + param); + break; + case 0x2040: + algo = make_shared, NoRefinement>>( + param); + break; + case 0x0140: + algo = make_shared< + StreamClustering, NoRefinement>>(param); + break; + case 0x2120: + algo = make_shared< + StreamClustering, NoRefinement>>(param); + break; + case 0x2140: + algo = make_shared< + StreamClustering, NoRefinement>>(param); + break; + case 0x0501: + algo = + make_shared, KMeans>>( param); - break; - case 0x2040: - algo = make_shared< - StreamClustering, NoRefinement>>( - param); - break; - case 0x0140: - algo = make_shared, NoRefinement>>(param); - break; - case 0x2120: - algo = make_shared, NoRefinement>>(param); - break; - case 0x2140: - algo = make_shared, NoRefinement>>(param); - break; - case 0x0501: - algo = make_shared< - StreamClustering, KMeans>>( - param); - break; - case 0x0201: - algo = make_shared< - StreamClustering, KMeans>>(param); - break; - case 0x0531: - algo = make_shared< - StreamClustering, KMeans>>( - param); - break; - case 0x0231: - algo = make_shared< - StreamClustering, KMeans>>(param); - break; - case 0x1412: - case 0x1402: algo = std::make_shared(param); break; - case 0x0312: algo = std::make_shared(param); break; - default: cerr << "Error: no such algorithm: " << hex << new_algo << dec << endl; exit(-1); + break; + case 0x0201: + algo = make_shared, KMeans>>(param); + break; + case 0x0531: + algo = make_shared, KMeans>>( + param); + break; + case 0x0231: + algo = make_shared, KMeans>>( + param); + break; + case 0x1412: + case 0x1402: + algo = std::make_shared(param); + break; + case 0x0312: + algo = std::make_shared(param); + break; + default: + cerr << "Error: no such algorithm: " << hex << new_algo << dec << endl; + exit(-1); + } + algo->Init(); + if (obj == efficiency || obj == accuracy_no_migration) { + for (auto ¢er : temp_centers) { + 
materialized_centers.push_back(center); } - algo->Init(); - if (obj == efficiency || obj == accuracy_no_migration) - { - for (auto ¢er : temp_centers) - { - materialized_centers.push_back(center); - } - } - else - { - for (auto ¢er : temp_centers) - { - algo->Insert(center); - } + } else { + for (auto ¢er : temp_centers) { + algo->Insert(center); } + } } -double calculateDispersion(const vector &queue_, PointPtr newCenter) -{ - // calculate dispersion - PointPtr variance = make_shared(newCenter->dim); - for (auto &point : queue_) - { - for (int i = 0; i < newCenter->dim; i++) - { - variance->feature[i] += - pow((point->getFeatureItem(i) - newCenter->getFeatureItem(i)), 2) / queue_.size(); - } - } - double dispersion = 0; - for (int i = 0; i < newCenter->dim; i++) - { - dispersion += sqrt(variance->feature[i]); +double calculateDispersion(const vector &queue_, PointPtr newCenter) { + // calculate dispersion + PointPtr variance = make_shared(newCenter->dim); + for (auto &point : queue_) { + for (int i = 0; i < newCenter->dim; i++) { + variance->feature[i] += + pow((point->getFeatureItem(i) - newCenter->getFeatureItem(i)), 2) / + queue_.size(); } - return dispersion; + } + double dispersion = 0; + for (int i = 0; i < newCenter->dim; i++) { + dispersion += sqrt(variance->feature[i]); + } + return dispersion; } diff --git a/src/Algorithm/Birch.cpp b/src/Algorithm/Birch.cpp index 02c3327a..1ba923c8 100644 --- a/src/Algorithm/Birch.cpp +++ b/src/Algorithm/Birch.cpp @@ -4,461 +4,405 @@ #include #include -void SESAME::Birch::Init() -{ - this->cfTree = DataStructureFactory::createCFTree(); - this->cfTree->setB(BirchParam.max_in_nodes); - this->cfTree->setL(BirchParam.max_leaf_nodes); - this->cfTree->setT(BirchParam.distance_threshold); - this->root = DataStructureFactory::createNode(); - this->root->setIsLeaf(true); - sum_timer.Tick(); +void SESAME::Birch::Init() { + this->cfTree = DataStructureFactory::createCFTree(); + this->cfTree->setB(BirchParam.max_in_nodes); + 
this->cfTree->setL(BirchParam.max_leaf_nodes); + this->cfTree->setT(BirchParam.distance_threshold); + this->root = DataStructureFactory::createNode(); + this->root->setIsLeaf(true); + sum_timer.Tick(); } -void SESAME::Birch::RunOnline(const SESAME::PointPtr input) -{ - ds_timer.Tick(); - forwardInsert(input); - ds_timer.Tock(); - lat_timer.Add(input->toa); +void SESAME::Birch::RunOnline(const SESAME::PointPtr input) { + ds_timer.Tick(); + forwardInsert(input); + ds_timer.Tock(); + lat_timer.Add(input->toa); } -void SESAME::Birch::RunOffline(DataSinkPtr sinkPtr) -{ - on_timer.Add(sum_timer.start); - ref_timer.Tick(); - for (int i = 0; i < this->leafNodes.size(); i++) - { - PointPtr centroid = DataStructureFactory::createPoint(i, 1, BirchParam.dim, 0); - for (int j = 0; j < BirchParam.dim; j++) - { - centroid->setFeatureItem( - this->leafNodes[i]->getCF()->getLS().at(j) / this->leafNodes[i]->getCF()->getN(), - j); - } - centroid->setClusteringCenter(i); - sinkPtr->put(centroid); +void SESAME::Birch::RunOffline(DataSinkPtr sinkPtr) { + on_timer.Add(sum_timer.start); + ref_timer.Tick(); + for (int i = 0; i < this->leafNodes.size(); i++) { + PointPtr centroid = GenericFactory::New(BirchParam.dim, i); + for (int j = 0; j < BirchParam.dim; j++) { + centroid->setFeatureItem(this->leafNodes[i]->getCF()->getLS().at(j) / + this->leafNodes[i]->getCF()->getN(), + j); } - ref_timer.Tock(); - sum_timer.Tock(); + centroid->setClusteringCenter(i); + sinkPtr->put(centroid); + } + ref_timer.Tock(); + sum_timer.Tock(); } -SESAME::Birch::Birch(param_t &cmd_params) -{ - this->param = cmd_params; - this->BirchParam.num_points = cmd_params.num_points; - this->BirchParam.dim = cmd_params.dim; - this->BirchParam.max_in_nodes = cmd_params.max_in_nodes; - this->BirchParam.max_leaf_nodes = cmd_params.max_leaf_nodes; - this->BirchParam.distance_threshold = cmd_params.distance_threshold; +SESAME::Birch::Birch(param_t &cmd_params) { + this->param = cmd_params; + this->BirchParam.num_points = 
cmd_params.num_points; + this->BirchParam.dim = cmd_params.dim; + this->BirchParam.max_in_nodes = cmd_params.max_in_nodes; + this->BirchParam.max_leaf_nodes = cmd_params.max_leaf_nodes; + this->BirchParam.distance_threshold = cmd_params.distance_threshold; } SESAME::Birch::~Birch() {} // when a new point insert into the CF, update the CF N, LS and SS -void SESAME::Birch::updateNLS(SESAME::NodePtr &node, SESAME::PointPtr &point, bool updateAll) -{ - SESAME::NodePtr nodeSearch = node; - while (true) - { - SESAME::CFPtr cf = nodeSearch->getCF(); - vector tmpLS = cf->getLS(); - vector tmpSS = cf->getSS(); - if (tmpLS.empty()) - { - for (int i = 0; i < point->getDimension(); i++) - { - tmpLS.push_back(0); - tmpSS.push_back(0); - } - } - cf->setN(cf->getN() + 1); - for (int i = 0; i < point->getDimension(); i++) - { - tmpLS[i] += point->getFeatureItem(i); - tmpSS[i] += pow(point->getFeatureItem(i), 2); - } - cf->setLS(tmpLS); - cf->setSS(tmpSS); - if (nodeSearch->getParent() != nullptr && updateAll) - { - nodeSearch = nodeSearch->getParent(); - } - else - break; +void SESAME::Birch::updateNLS(SESAME::NodePtr &node, SESAME::PointPtr &point, + bool updateAll) { + SESAME::NodePtr nodeSearch = node; + while (true) { + SESAME::CFPtr cf = nodeSearch->getCF(); + vector tmpLS = cf->getLS(); + vector tmpSS = cf->getSS(); + if (tmpLS.empty()) { + for (int i = 0; i < point->getDimension(); i++) { + tmpLS.push_back(0); + tmpSS.push_back(0); + } + } + cf->setN(cf->getN() + 1); + for (int i = 0; i < point->getDimension(); i++) { + tmpLS[i] += point->getFeatureItem(i); + tmpSS[i] += pow(point->getFeatureItem(i), 2); } + cf->setLS(tmpLS); + cf->setSS(tmpSS); + if (nodeSearch->getParent() != nullptr && updateAll) { + nodeSearch = nodeSearch->getParent(); + } else + break; + } } // centroid index: -1(virtual) // centroid feature: mean of the feature of cluster points // centroid cluster: -1 -void SESAME::Birch::calculateCentroid(SESAME::CFPtr &cf, SESAME::PointPtr ¢roid) -{ - 
centroid->setIndex(-1); - centroid->setClusteringCenter(-1); - auto &ls = cf->LS; - const auto n = cf->getN(); - for (int i = 0; i < n; ++i) centroid->feature[i] = ls[i] / n; +void SESAME::Birch::calculateCentroid(SESAME::CFPtr &cf, + SESAME::PointPtr ¢roid) { + centroid->setIndex(-1); + centroid->setClusteringCenter(-1); + auto &ls = cf->LS; + const auto n = cf->getN(); + for (int i = 0; i < n; ++i) + centroid->feature[i] = ls[i] / n; } // use Manhattan Distance -void SESAME::Birch::pointToClusterDist(SESAME::PointPtr &insertPoint, SESAME::NodePtr &node, - double &dist) -{ - dist = 0; - SESAME::PointPtr centroid = make_shared(BirchParam.dim); - SESAME::CFPtr curCF = node->getCF(); - calculateCentroid(curCF, centroid); - dist = insertPoint->L1Dist(centroid); +void SESAME::Birch::pointToClusterDist(SESAME::PointPtr &insertPoint, + SESAME::NodePtr &node, double &dist) { + dist = 0; + SESAME::PointPtr centroid = make_shared(BirchParam.dim); + SESAME::CFPtr curCF = node->getCF(); + calculateCentroid(curCF, centroid); + dist = insertPoint->L1Dist(centroid); } // use Manhattan Distance -double SESAME::Birch::clusterToClusterDist(SESAME::NodePtr &nodeA, SESAME::NodePtr &nodeB) -{ - SESAME::PointPtr centroidA = make_shared(BirchParam.dim); - SESAME::PointPtr centroidB = make_shared(BirchParam.dim); - SESAME::CFPtr curCFA = nodeA->getCF(); - SESAME::CFPtr curCFB = nodeB->getCF(); - calculateCentroid(curCFA, centroidA); - calculateCentroid(curCFB, centroidB); - return centroidA->L1Dist(centroidB); +double SESAME::Birch::clusterToClusterDist(SESAME::NodePtr &nodeA, + SESAME::NodePtr &nodeB) { + SESAME::PointPtr centroidA = make_shared(BirchParam.dim); + SESAME::PointPtr centroidB = make_shared(BirchParam.dim); + SESAME::CFPtr curCFA = nodeA->getCF(); + SESAME::CFPtr curCFB = nodeB->getCF(); + calculateCentroid(curCFA, centroidA); + calculateCentroid(curCFB, centroidB); + return centroidA->L1Dist(centroidB); } // select the closest child cluster according to Manhattan Distance 
-void SESAME::Birch::selectChild(vector &children, SESAME::PointPtr &insertPoint, - SESAME::NodePtr &node) -{ - double dist = 0; - double temp = 0; - pointToClusterDist(insertPoint, children.at(0), dist); - node = children.at(0); - for (int i = 1; i < children.size(); i++) - { - pointToClusterDist(insertPoint, children.at(i), temp); - if (temp < dist) - { - dist = temp; - node = children.at(i); - } +void SESAME::Birch::selectChild(vector &children, + SESAME::PointPtr &insertPoint, + SESAME::NodePtr &node) { + double dist = 0; + double temp = 0; + pointToClusterDist(insertPoint, children.at(0), dist); + node = children.at(0); + for (int i = 1; i < children.size(); i++) { + pointToClusterDist(insertPoint, children.at(i), temp); + if (temp < dist) { + dist = temp; + node = children.at(i); } + } } // calculate the radius of a cluster -double SESAME::Birch::calculateRadius(SESAME::PointPtr &point, SESAME::PointPtr ¢roid) -{ - double denominator = 0; - double radius = 0; - for (int i = 0; i < point->getDimension(); i++) - { - denominator += pow(centroid->getFeatureItem(i) - point->getFeatureItem(i), 2); - } - radius = sqrt(denominator); - return radius; +double SESAME::Birch::calculateRadius(SESAME::PointPtr &point, + SESAME::PointPtr ¢roid) { + double denominator = 0; + double radius = 0; + for (int i = 0; i < point->getDimension(); i++) { + denominator += + pow(centroid->getFeatureItem(i) - point->getFeatureItem(i), 2); + } + radius = sqrt(denominator); + return radius; } void SESAME::Birch::calculateCorDistance(vector> &distance, - vector &nodes) -{ - const auto n = nodes.size(); - distance = vector>(n, vector(n, 0)); - auto centroids = vector(n); - for (int i = 0; i < n; i++) - { - centroids[i] = make_shared(BirchParam.dim); - auto cf = nodes[i]->getCF(); - calculateCentroid(cf, centroids[i]); - } - // calculate the correlate distance - for (int i = 0; i < nodes.size(); i++) - { - SESAME::PointPtr centroidA = centroids[i]; - for (int j = i + 1; j < nodes.size(); j++) 
- { - SESAME::PointPtr centroidB = centroids[j]; - auto dist = centroidA->L1Dist(centroidB); - distance[i][j] = dist; - distance[j][i] = dist; - } + vector &nodes) { + const auto n = nodes.size(); + distance = vector>(n, vector(n, 0)); + auto centroids = vector(n); + for (int i = 0; i < n; i++) { + centroids[i] = make_shared(BirchParam.dim); + auto cf = nodes[i]->getCF(); + calculateCentroid(cf, centroids[i]); + } + // calculate the correlate distance + for (int i = 0; i < nodes.size(); i++) { + SESAME::PointPtr centroidA = centroids[i]; + for (int j = i + 1; j < nodes.size(); j++) { + SESAME::PointPtr centroidB = centroids[j]; + auto dist = centroidA->L1Dist(centroidB); + distance[i][j] = dist; + distance[j][i] = dist; } + } } -void SESAME::Birch::setCFToBlankNode(SESAME::NodePtr &curNode, SESAME::PointPtr &point) -{ - SESAME::CFPtr curCF = curNode->getCF(); - curCF->setN(curCF->getN() + 1); - vector newLs; - vector newSs; - for (int i = 0; i < point->getDimension(); i++) - { - newLs.push_back(point->getFeatureItem(i)); - newSs.push_back(pow(point->getFeatureItem(i), 2)); - } - curCF->setSS(newSs); - curCF->setLS(newLs); +void SESAME::Birch::setCFToBlankNode(SESAME::NodePtr &curNode, + SESAME::PointPtr &point) { + SESAME::CFPtr curCF = curNode->getCF(); + curCF->setN(curCF->getN() + 1); + vector newLs; + vector newSs; + for (int i = 0; i < point->getDimension(); i++) { + newLs.push_back(point->getFeatureItem(i)); + newSs.push_back(pow(point->getFeatureItem(i), 2)); + } + curCF->setSS(newSs); + curCF->setLS(newLs); } -void SESAME::Birch::addNodeNLSToNode(SESAME::NodePtr &child, SESAME::NodePtr &parent) -{ - SESAME::CFPtr childCF = child->getCF(); - SESAME::CFPtr parCF = parent->getCF(); - parCF->setN(parCF->getN() + childCF->getN()); - vector newLs; - vector newSs; - for (int i = 0; i < childCF->getLS().size(); i++) - { - newLs.push_back(childCF->getLS().at(i) + parCF->getLS().at(i)); - newSs.push_back(childCF->getSS().at(i) + parCF->getSS().at(i)); - } - 
parCF->setLS(newLs); - parCF->setSS(newSs); +void SESAME::Birch::addNodeNLSToNode(SESAME::NodePtr &child, + SESAME::NodePtr &parent) { + SESAME::CFPtr childCF = child->getCF(); + SESAME::CFPtr parCF = parent->getCF(); + parCF->setN(parCF->getN() + childCF->getN()); + vector newLs; + vector newSs; + for (int i = 0; i < childCF->getLS().size(); i++) { + newLs.push_back(childCF->getLS().at(i) + parCF->getLS().at(i)); + newSs.push_back(childCF->getSS().at(i) + parCF->getSS().at(i)); + } + parCF->setLS(newLs); + parCF->setSS(newSs); } -void SESAME::Birch::initializeCF(SESAME::CFPtr &cf, int dim) -{ - vector ls = cf->getLS(); - vector ss = cf->getSS(); - for (int i = 0; i < dim; i++) - { - ls.push_back(0); - ss.push_back(0); - } - cf->setLS(ls); - cf->setSS(ss); +void SESAME::Birch::initializeCF(SESAME::CFPtr &cf, int dim) { + vector ls = cf->getLS(); + vector ss = cf->getSS(); + for (int i = 0; i < dim; i++) { + ls.push_back(0); + ss.push_back(0); + } + cf->setLS(ls); + cf->setSS(ss); } -void SESAME::Birch::clearChildParents(vector &children) -{ - for (auto child : children) - { - child->clearParents(); - } +void SESAME::Birch::clearChildParents(vector &children) { + for (auto child : children) { + child->clearParents(); + } } -void SESAME::Birch::forwardInsert(SESAME::PointPtr point) -{ - NodePtr curNode = this->root; - if (curNode->getCF()->getN() == 0) - { - updateNLS(curNode, point, true); - } - else - { - while (1) - { - vector childrenNode = curNode->getChildren(); - if (curNode->getIsLeaf()) - { - CFPtr curCF = curNode->getCF(); - if (curCF->getN() == 0) - { - initializeCF(curCF, point->getDimension()); - } - PointPtr centroid = make_shared(BirchParam.dim); - calculateCentroid(curCF, centroid); - if (calculateRadius(point, centroid) <= this->cfTree->getT()) - { // concept drift detection - // whether the new radius is lower than threshold T - auto a = calculateRadius(point, centroid); - if (point->getIndex() % 100 == 0) - { - std::cout << a; - } - 
updateNLS(curNode, point, true); +void SESAME::Birch::forwardInsert(SESAME::PointPtr point) { + NodePtr curNode = this->root; + if (curNode->getCF()->getN() == 0) { + updateNLS(curNode, point, true); + } else { + while (1) { + vector childrenNode = curNode->getChildren(); + if (curNode->getIsLeaf()) { + CFPtr curCF = curNode->getCF(); + if (curCF->getN() == 0) { + initializeCF(curCF, point->getDimension()); + } + PointPtr centroid = make_shared(BirchParam.dim); + calculateCentroid(curCF, centroid); + if (calculateRadius(point, centroid) <= + this->cfTree + ->getT()) { // concept drift detection + // whether the new radius is lower than threshold T + auto a = calculateRadius(point, centroid); + if (point->getIndex() % 100 == 0) { + std::cout << a; + } + updateNLS(curNode, point, true); - // means this point could get included in this cluster - // SESAME_DEBUG("No concept drift occurs(t <= T), insert tha point into the leaf - // node..."); - break; - // Normally insert the data point into the tree leafNode without concept drift - } - else - { - auto a = calculateRadius(point, centroid); - if (point->getIndex() % 100 == 0) - { - std::cout << a; - } - // concept drift adaption - // SESAME_DEBUG("Concept drift occurs(t > T), the current leaf node capacity - // reaches the threshold T"); - backwardEvolution(curNode, point); - break; - } - } - else - { - selectChild(childrenNode, point, curNode); - } + // means this point could get included in this cluster + // SESAME_DEBUG("No concept drift occurs(t <= T), insert tha point + // into the leaf node..."); + break; + // Normally insert the data point into the tree leafNode without + // concept drift + } else { + auto a = calculateRadius(point, centroid); + if (point->getIndex() % 100 == 0) { + std::cout << a; + } + // concept drift adaption + // SESAME_DEBUG("Concept drift occurs(t > T), the current leaf node + // capacity reaches the threshold T"); + backwardEvolution(curNode, point); + break; } + } else { + 
selectChild(childrenNode, point, curNode); + } } + } } // concept drift adaption -void SESAME::Birch::backwardEvolution(SESAME::NodePtr &curNode, SESAME::PointPtr &point) -{ - if (curNode->getParent() == nullptr) - { // means current node is root node - // SESAME_DEBUG("l <= L, create a new leaf node and insert the point into it(root change)"); - NodePtr newRoot = make_shared(); - newRoot->setIsLeaf(false); - newRoot->setChild(curNode); - curNode->setParent(newRoot); +void SESAME::Birch::backwardEvolution(SESAME::NodePtr &curNode, + SESAME::PointPtr &point) { + if (curNode->getParent() == nullptr) { // means current node is root node + // SESAME_DEBUG("l <= L, create a new leaf node and insert the point into + // it(root change)"); + NodePtr newRoot = make_shared(); + newRoot->setIsLeaf(false); + newRoot->setChild(curNode); + curNode->setParent(newRoot); - NodePtr newNode = make_shared(); - newNode->setIsLeaf(true); - newNode->setParent(newRoot); - vector curLS = curNode->getCF()->getLS(); - vector curSS = curNode->getCF()->getSS(); - int curN = curNode->getCF()->getN(); - newRoot->getCF()->setLS(curLS); - newRoot->getCF()->setSS(curSS); - newRoot->getCF()->setN(curN); - newRoot->setIndex(this->leafMask++); - // here we need to remove the old root and add the new one into the leafnodes set - this->leafNodes.push_back(newRoot); + NodePtr newNode = make_shared(); + newNode->setIsLeaf(true); + newNode->setParent(newRoot); + vector curLS = curNode->getCF()->getLS(); + vector curSS = curNode->getCF()->getSS(); + int curN = curNode->getCF()->getN(); + newRoot->getCF()->setLS(curLS); + newRoot->getCF()->setSS(curSS); + newRoot->getCF()->setN(curN); + newRoot->setIndex(this->leafMask++); + // here we need to remove the old root and add the new one into the + // leafnodes set + this->leafNodes.push_back(newRoot); - // update the parent node - newRoot->setChild(newNode); - updateNLS(newNode, point, true); - this->root = newRoot; - } - else - { - NodePtr parent = 
curNode->getParent(); - NodePtr newNode = make_shared(); - newNode->setIsLeaf(true); - newNode->setParent(parent); - parent->setChild(newNode); - updateNLS(newNode, point, false); - if (parent->getChildren().size() < this->cfTree->getL()) - { - // whether the number of CFs(clusters) in the current leaf node is lower thant threshold - // L - // SESAME_DEBUG("l <= L, create a new leaf node and insert the point into it"); + // update the parent node + newRoot->setChild(newNode); + updateNLS(newNode, point, true); + this->root = newRoot; + } else { + NodePtr parent = curNode->getParent(); + NodePtr newNode = make_shared(); + newNode->setIsLeaf(true); + newNode->setParent(parent); + parent->setChild(newNode); + updateNLS(newNode, point, false); + if (parent->getChildren().size() < this->cfTree->getL()) { + // whether the number of CFs(clusters) in the current leaf node is lower + // thant threshold + // L + // SESAME_DEBUG("l <= L, create a new leaf node and insert the point into + // it"); - // update the parent node - updateNLS(parent, point, true); + // update the parent node + updateNLS(parent, point, true); + } else { + // SESAME_DEBUG("l > L, parent node of the current leaf node capacity + // reaches the threshold L"); SESAME_DEBUG("split a new parent node from + // the old one "); + bool CurNodeIsLeaf = true; + while (true) { + NodePtr parParent; + if (parent->getParent() == nullptr) { + parParent = make_shared(); + parParent->setIsLeaf(false); + this->root = parParent; + CFPtr parCF = parent->getCF(); + parParent->setCF(parCF); + } else { + parParent = parent->getParent(); + parParent->removeChild(parent); + } + NodePtr newParentA = make_shared(); + NodePtr newParentB = make_shared(); + if (parent->getChildren().at(0)->getIsLeaf()) { + for (int i = 0; i < this->leafNodes.size(); i++) { + if (this->leafNodes.at(i)->getIndex() == parent->getIndex()) { + this->leafNodes.erase(this->leafNodes.begin() + i); + } + } + newParentA->setIndex(++this->leafMask); + 
newParentB->setIndex(++this->leafMask); + this->leafNodes.push_back(newParentA); + this->leafNodes.push_back(newParentB); } - else - { - // SESAME_DEBUG("l > L, parent node of the current leaf node capacity reaches the - // threshold L"); - // SESAME_DEBUG("split a new parent node from the old one "); - bool CurNodeIsLeaf = true; - while (true) - { - NodePtr parParent; - if (parent->getParent() == nullptr) - { - parParent = make_shared(); - parParent->setIsLeaf(false); - this->root = parParent; - CFPtr parCF = parent->getCF(); - parParent->setCF(parCF); - } - else - { - parParent = parent->getParent(); - parParent->removeChild(parent); - } - NodePtr newParentA = make_shared(); - NodePtr newParentB = make_shared(); - if (parent->getChildren().at(0)->getIsLeaf()) - { - for (int i = 0; i < this->leafNodes.size(); i++) - { - if (this->leafNodes.at(i)->getIndex() == parent->getIndex()) - { - this->leafNodes.erase(this->leafNodes.begin() + i); - } - } - newParentA->setIndex(++this->leafMask); - newParentB->setIndex(++this->leafMask); - this->leafNodes.push_back(newParentA); - this->leafNodes.push_back(newParentB); - } - - newParentB->setIsLeaf(false); - newParentA->setIsLeaf(false); - newParentB->setParent(parParent); - newParentA->setParent(parParent); - parParent->setChild(newParentB); - parParent->setChild(newParentA); - CFPtr cfA = newParentA->getCF(); - CFPtr cfB = newParentB->getCF(); - initializeCF(cfA, point->getDimension()); - initializeCF(cfB, point->getDimension()); - vector broNodes = parent->getChildren(); - vector> corCFDistance; - calculateCorDistance(corCFDistance, broNodes); + newParentB->setIsLeaf(false); + newParentA->setIsLeaf(false); + newParentB->setParent(parParent); + newParentA->setParent(parParent); + parParent->setChild(newParentB); + parParent->setChild(newParentA); + CFPtr cfA = newParentA->getCF(); + CFPtr cfB = newParentB->getCF(); + initializeCF(cfA, point->getDimension()); + initializeCF(cfB, point->getDimension()); - // choose two 
farthest CFs as seedA and seedB - int seedA = 0; - int seedB = 0; - double max = 0; - for (int i = 0; i < broNodes.size(); i++) - { - for (int j = i; j < broNodes.size(); j++) - { - if (max < corCFDistance[i][j]) - { - seedA = i; - seedB = j; - max = corCFDistance[i][j]; - } - } - } + vector broNodes = parent->getChildren(); + vector> corCFDistance; + calculateCorDistance(corCFDistance, broNodes); - // insert the child node into the nearest seed(A / B) - clearChildParents(broNodes); - // insert seedA node into parent A - newParentA->setChild(broNodes[seedA]); - broNodes[seedA]->setParent(newParentA); - addNodeNLSToNode(broNodes[seedA], newParentA); - // insert seedB node into parent B - newParentB->setChild(broNodes[seedB]); - broNodes[seedB]->setParent(newParentB); - addNodeNLSToNode(broNodes[seedB], newParentB); - // split other nodes into A and B - for (int i = 0; i < broNodes.size(); i++) - { - if (i != seedA and i != seedB) - { - if (corCFDistance[i][seedA] < corCFDistance[i][seedB]) - { - newParentA->setChild(broNodes[i]); - addNodeNLSToNode(broNodes[i], newParentA); - broNodes[i]->clearParents(); - broNodes[i]->setParent(newParentA); - } - else - { - newParentB->setChild(broNodes[i]); - addNodeNLSToNode(broNodes[i], newParentB); - broNodes[i]->clearParents(); - broNodes[i]->setParent(newParentB); - } - } - } - if (CurNodeIsLeaf) - { - updateNLS(parParent, point, true); - } + // choose two farthest CFs as seedA and seedB + int seedA = 0; + int seedB = 0; + double max = 0; + for (int i = 0; i < broNodes.size(); i++) { + for (int j = i; j < broNodes.size(); j++) { + if (max < corCFDistance[i][j]) { + seedA = i; + seedB = j; + max = corCFDistance[i][j]; + } + } + } - if (parParent->getChildren().size() <= this->cfTree->getB()) - { - // SESAME_DEBUG("b < B, remove the old node and insert the new nodeA and nodeB - // into the parent node"); - break; - } - else - { - // SESAME_DEBUG("b >= B, parent node of the current interior node capacity - // reaches the 
threshold B"); - curNode = curNode->getParent(); - parent = parParent; - CurNodeIsLeaf = false; - } + // insert the child node into the nearest seed(A / B) + clearChildParents(broNodes); + // insert seedA node into parent A + newParentA->setChild(broNodes[seedA]); + broNodes[seedA]->setParent(newParentA); + addNodeNLSToNode(broNodes[seedA], newParentA); + // insert seedB node into parent B + newParentB->setChild(broNodes[seedB]); + broNodes[seedB]->setParent(newParentB); + addNodeNLSToNode(broNodes[seedB], newParentB); + // split other nodes into A and B + for (int i = 0; i < broNodes.size(); i++) { + if (i != seedA and i != seedB) { + if (corCFDistance[i][seedA] < corCFDistance[i][seedB]) { + newParentA->setChild(broNodes[i]); + addNodeNLSToNode(broNodes[i], newParentA); + broNodes[i]->clearParents(); + broNodes[i]->setParent(newParentA); + } else { + newParentB->setChild(broNodes[i]); + addNodeNLSToNode(broNodes[i], newParentB); + broNodes[i]->clearParents(); + broNodes[i]->setParent(newParentB); } + } + } + if (CurNodeIsLeaf) { + updateNLS(parParent, point, true); + } + + if (parParent->getChildren().size() <= this->cfTree->getB()) { + // SESAME_DEBUG("b < B, remove the old node and insert the new nodeA + // and nodeB into the parent node"); + break; + } else { + // SESAME_DEBUG("b >= B, parent node of the current interior node + // capacity reaches the threshold B"); + curNode = curNode->getParent(); + parent = parParent; + CurNodeIsLeaf = false; } + } } + } } diff --git a/src/Algorithm/CluStream.cpp b/src/Algorithm/CluStream.cpp index 74050760..4598e027 100644 --- a/src/Algorithm/CluStream.cpp +++ b/src/Algorithm/CluStream.cpp @@ -1,4 +1,5 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) // // Created by 1124a on 2021/8/16. 
@@ -15,51 +16,51 @@ * initialData:input initial data *@Return: void */ -SESAME::CluStream::CluStream(param_t &cmd_params) -{ - this->param = cmd_params; - this->CluStreamParam.num_points = cmd_params.num_points; - this->CluStreamParam.dim = cmd_params.dim; - this->CluStreamParam.num_clusters = cmd_params.num_online_clusters; - this->CluStreamParam.num_last_arr = cmd_params.num_last_arr; - this->CluStreamParam.time_window = cmd_params.time_window; - this->CluStreamParam.time_interval = cmd_params.time_interval; - this->CluStreamParam.num_offline_clusters = cmd_params.k; - this->CluStreamParam.radius = cmd_params.radius; - this->CluStreamParam.seed = cmd_params.seed; - this->CluStreamParam.buf_size = cmd_params.buf_size; - if (this->CluStreamParam.offline_time_window > this->CluStreamParam.num_points) - this->CluStreamParam.offline_time_window = cmd_params.offline_time_window; - this->pointsFitted = 0; - this->pointsForgot = 0; - this->pointsMerged = 0; +SESAME::CluStream::CluStream(param_t &cmd_params) { + this->param = cmd_params; + this->CluStreamParam.num_points = cmd_params.num_points; + this->CluStreamParam.dim = cmd_params.dim; + this->CluStreamParam.num_clusters = cmd_params.num_online_clusters; + this->CluStreamParam.num_last_arr = cmd_params.num_last_arr; + this->CluStreamParam.time_window = cmd_params.time_window; + this->CluStreamParam.time_interval = cmd_params.time_interval; + this->CluStreamParam.num_offline_clusters = cmd_params.k; + this->CluStreamParam.radius = cmd_params.radius; + this->CluStreamParam.seed = cmd_params.seed; + this->CluStreamParam.buf_size = cmd_params.buf_size; + if (this->CluStreamParam.offline_time_window > + this->CluStreamParam.num_points) + this->CluStreamParam.offline_time_window = cmd_params.offline_time_window; + this->pointsFitted = 0; + this->pointsForgot = 0; + this->pointsMerged = 0; } SESAME::CluStream::~CluStream() {} -void SESAME::CluStream::initOffline(vector &initData, vector &initialData) -{ - for (int i = 0; i < 
CluStreamParam.num_clusters; i++) - { - microClusters.push_back(DataStructureFactory::createMicroCluster(CluStreamParam.dim, i)); - } - std::vector centers; - std::vector> oldGroups, newGroups; - this->kmeans->runKMeans(CluStreamParam.num_clusters, CluStreamParam.buf_size, centers, initData, - oldGroups, newGroups, CluStreamParam.seed, true); - // store the result input output - this->kmeans->storeResult(oldGroups, initialData); - for (int i = 0; i < CluStreamParam.buf_size; i++) - { - int clusterId = initialData[i]->getClusteringCenter(); - // SESAME_INFO("the belonging micro cluster id is !"< &initData, + vector &initialData) { + for (int i = 0; i < CluStreamParam.num_clusters; i++) { + microClusters.push_back( + DataStructureFactory::createMicroCluster(CluStreamParam.dim, i)); + } + std::vector centers; + std::vector> oldGroups, newGroups; + this->kmeans->runKMeans(CluStreamParam.num_clusters, CluStreamParam.buf_size, + centers, initData, oldGroups, newGroups, + CluStreamParam.seed, true); + // store the result input output + this->kmeans->storeResult(oldGroups, initialData); + for (int i = 0; i < CluStreamParam.buf_size; i++) { + int clusterId = initialData[i]->getClusteringCenter(); + // SESAME_INFO("the belonging micro cluster id is !"<getIndex() - startTime; - if (microClusters[clusterId]->weight == 0) - microClusters[clusterId]->Init(initialData[i], timestamp); - else - microClusters[clusterId]->insert(initialData[i], timestamp); - } + int timestamp = initialData[i]->getIndex() - startTime; + if (microClusters[clusterId]->weight == 0) + microClusters[clusterId]->Init(initialData[i], timestamp); + else + microClusters[clusterId]->insert(initialData[i], timestamp); + } } /** @@ -67,176 +68,166 @@ void SESAME::CluStream::initOffline(vector &initData, vector * first determine closest clusters to the data, * then judge whether it locates into the maximum boundary of this micro cluster * 1. data object fits into closest micro cluster; - * 2. 
data does not fit in any cluster, delete oldest one & create a new cluster; + * 2. data does not fit in any cluster, delete oldest one & create a new + * cluster; * 3. merge two closest clusters & create a new cluster. * @Param: data: input data object * @Return: store the output result(with computed clustering center) into ??? */ -void SESAME::CluStream::incrementalCluster(PointPtr data) -{ // 1. Determine closest clusters - MicroClusterPtr closestCluster; - double minDistance = doubleMax; - for (int i = 0; i < this->CluStreamParam.num_clusters; i++) - { - double dist = microClusters[i]->calCentroidDistance(data); - if (dist < minDistance) - { - closestCluster = microClusters[i]->copy(); - minDistance = dist; - } - } - double radius = calRadius(closestCluster); - if (minDistance < radius) - { - insertIntoCluster(data, closestCluster); - return; - } - /** 3. Date does not fit -- free - * some space to insert a new micro cluster - * */ - // 3.1 delete oldest one & create a new cluster - if (!deleteCreateCluster(data)) - { - // 3.2 merge two closest clusters & create a new cluster - MergeCreateCluster(data); +void SESAME::CluStream::incrementalCluster( + PointPtr data) { // 1. Determine closest clusters + MicroClusterPtr closestCluster; + double minDistance = doubleMax; + for (int i = 0; i < this->CluStreamParam.num_clusters; i++) { + double dist = microClusters[i]->calCentroidDistance(data); + if (dist < minDistance) { + closestCluster = microClusters[i]->copy(); + minDistance = dist; } + } + double radius = calRadius(closestCluster); + if (minDistance < radius) { + insertIntoCluster(data, closestCluster); + return; + } + /** 3. 
Date does not fit -- free + * some space to insert a new micro cluster + * */ + // 3.1 delete oldest one & create a new cluster + if (!deleteCreateCluster(data)) { + // 3.2 merge two closest clusters & create a new cluster + MergeCreateCluster(data); + } } // Calculate and return the value of radius -double SESAME::CluStream::calRadius(MicroClusterPtr closestCluster) -{ - double radius; - if (closestCluster->weight == 1) - { - // Special case: estimate radius by determining the distance to the - // next closest cluster - radius = doubleMax; - dataPoint centroid = closestCluster->getCentroid(); - for (int i = 0; i < this->CluStreamParam.num_clusters; i++) - { - if (microClusters[i]->id == closestCluster->id) - { - continue; - } - double dist = - distance(microClusters[i]->getCentroid(), centroid, this->CluStreamParam.dim); - radius = std::min(dist, radius); - } +double SESAME::CluStream::calRadius(MicroClusterPtr closestCluster) { + double radius; + if (closestCluster->weight == 1) { + // Special case: estimate radius by determining the distance to the + // next closest cluster + radius = doubleMax; + dataPoint centroid = closestCluster->getCentroid(); + for (int i = 0; i < this->CluStreamParam.num_clusters; i++) { + if (microClusters[i]->id == closestCluster->id) { + continue; + } + double dist = distance(microClusters[i]->getCentroid(), centroid, + this->CluStreamParam.dim); + radius = std::min(dist, radius); } - else - radius = closestCluster->getRadius(this->CluStreamParam.radius); - return radius; + } else + radius = closestCluster->getRadius(this->CluStreamParam.radius); + return radius; } // Date fits case -void SESAME::CluStream::insertIntoCluster(PointPtr data, MicroClusterPtr operateCluster) -{ - // SESAME_INFO("This data fits"); - pointsFitted++; +void SESAME::CluStream::insertIntoCluster(PointPtr data, + MicroClusterPtr operateCluster) { + // SESAME_INFO("This data fits"); + pointsFitted++; - int timestamp = data->getIndex() - startTime; - 
operateCluster->insert(data, timestamp); + int timestamp = data->getIndex() - startTime; + operateCluster->insert(data, timestamp); } // Delete the oldest cluster and create new one case -bool SESAME::CluStream::deleteCreateCluster(PointPtr data) -{ - // 3.1 Try to forget old micro clusters +bool SESAME::CluStream::deleteCreateCluster(PointPtr data) { + // 3.1 Try to forget old micro clusters - int elapsedTime = data->getIndex(); - int threshold = 0; // Kernels before this can be forgotten - if (elapsedTime - this->CluStreamParam.time_window >= 0) - threshold = elapsedTime - this->CluStreamParam.time_window; - for (int i = 0; i < this->CluStreamParam.num_clusters; i++) - { - if (microClusters[i]->getRelevanceStamp(this->CluStreamParam.num_last_arr) < threshold) - { - // SESAME_INFO("Need to delete"); - int newId = this->CluStreamParam.num_clusters + pointsForgot + pointsMerged; - delMicroClusters.push_back(microClusters[i]); - DataStructureFactory::clearMicroCluster(microClusters[i]); - microClusters[i] = DataStructureFactory::createMicroCluster(CluStreamParam.dim, newId); - microClusters[i]->Init(std::move(data), elapsedTime); - pointsForgot++; + int elapsedTime = data->getIndex(); + int threshold = 0; // Kernels before this can be forgotten + if (elapsedTime - this->CluStreamParam.time_window >= 0) + threshold = elapsedTime - this->CluStreamParam.time_window; + for (int i = 0; i < this->CluStreamParam.num_clusters; i++) { + if (microClusters[i]->getRelevanceStamp(this->CluStreamParam.num_last_arr) < + threshold) { + // SESAME_INFO("Need to delete"); + int newId = + this->CluStreamParam.num_clusters + pointsForgot + pointsMerged; + delMicroClusters.push_back(microClusters[i]); + DataStructureFactory::clearMicroCluster(microClusters[i]); + microClusters[i] = + DataStructureFactory::createMicroCluster(CluStreamParam.dim, newId); + microClusters[i]->Init(std::move(data), elapsedTime); + pointsForgot++; - return true; - } + return true; } + } - return false; + return 
false; } // Merge two closest clusters & create a new cluster -void SESAME::CluStream::MergeCreateCluster(PointPtr data) -{ - // SESAME_INFO("Micro cluster needs to merge"); - unsigned int closestA = 0; - unsigned int closestB = 0; - double minDistance = doubleMax; - for (int i = 0; i < this->CluStreamParam.num_clusters; i++) - { // O(n(n+1)/2) - dataPoint centroidA = microClusters[i]->getCentroid(); - for (int j = i + 1; j < this->CluStreamParam.num_clusters; j++) - { - double dist = - distance(centroidA, microClusters[j]->getCentroid(), this->CluStreamParam.dim); - if (dist < minDistance) - { - minDistance = dist; - closestA = i; - closestB = j; - } - } +void SESAME::CluStream::MergeCreateCluster(PointPtr data) { + // SESAME_INFO("Micro cluster needs to merge"); + unsigned int closestA = 0; + unsigned int closestB = 0; + double minDistance = doubleMax; + for (int i = 0; i < this->CluStreamParam.num_clusters; i++) { // O(n(n+1)/2) + dataPoint centroidA = microClusters[i]->getCentroid(); + for (int j = i + 1; j < this->CluStreamParam.num_clusters; j++) { + double dist = distance(centroidA, microClusters[j]->getCentroid(), + this->CluStreamParam.dim); + if (dist < minDistance) { + minDistance = dist; + closestA = i; + closestB = j; + } } - int newId = this->CluStreamParam.num_clusters + pointsForgot + pointsMerged; - microClusters[closestA]->merge(microClusters[closestB]); - int elapsedTime = data->getIndex(); - DataStructureFactory::clearMicroCluster(microClusters[closestB]); - microClusters[closestB] = DataStructureFactory::createMicroCluster(CluStreamParam.dim, newId); - microClusters[closestB]->Init(std::move(data), elapsedTime); - pointsMerged++; - return; + } + int newId = this->CluStreamParam.num_clusters + pointsForgot + pointsMerged; + microClusters[closestA]->merge(microClusters[closestB]); + int elapsedTime = data->getIndex(); + DataStructureFactory::clearMicroCluster(microClusters[closestB]); + microClusters[closestB] = + 
DataStructureFactory::createMicroCluster(CluStreamParam.dim, newId); + microClusters[closestB]->Init(std::move(data), elapsedTime); + pointsMerged++; + return; } -void SESAME::CluStream::microClusterToPoint(std::vector µClusters, - vector &points) const -{ - for (int i = 0; i < this->CluStreamParam.num_clusters; i++) - { - PointPtr point = DataStructureFactory::createPoint(i, microClusters[i]->weight, - microClusters[i]->centroid.size(), 0); - for (SESAME::dataPoint::size_type j = 0; j < microClusters[i]->centroid.size(); j++) - point->setFeatureItem(microClusters[i]->centroid[j], j); - // points; - points.push_back(point); - } +void SESAME::CluStream::microClusterToPoint( + std::vector µClusters, + vector &points) const { + for (int i = 0; i < this->CluStreamParam.num_clusters; i++) { + PointPtr point = + GenericFactory::New(microClusters[i]->centroid.size(), i); + point->weight = microClusters[i]->weight; + for (SESAME::dataPoint::size_type j = 0; + j < microClusters[i]->centroid.size(); j++) + point->setFeatureItem(microClusters[i]->centroid[j], j); + // points; + points.push_back(point); + } } -double SESAME::CluStream::distance(dataPoint a, dataPoint b, int dim) -{ - double temp = 0; - for (int i = 0; i < dim; i++) - { - double diff = b[i] - a[i]; - temp += diff * diff; - } - return sqrt(temp); +double SESAME::CluStream::distance(dataPoint a, dataPoint b, int dim) { + double temp = 0; + for (int i = 0; i < dim; i++) { + double diff = b[i] - a[i]; + temp += diff * diff; + } + return sqrt(temp); } -void SESAME::CluStream::Init() -{ - this->window = WindowFactory::createLandmarkWindow(); - this->window->pyramidalWindow.time_interval = this->CluStreamParam.time_interval; - this->lastUpdateTime = 0; - window->initPyramidalWindow(this->window->pyramidalWindow.time_interval); - sum_timer.Tick(); +void SESAME::CluStream::Init() { + this->window = WindowFactory::createLandmarkWindow(); + this->window->pyramidalWindow.time_interval = + 
this->CluStreamParam.time_interval; + this->lastUpdateTime = 0; + window->initPyramidalWindow(this->window->pyramidalWindow.time_interval); + sum_timer.Tick(); } /** * @Description: online clustering of Clustream, - * insert every data object and cluster them incrementally,it has three conditions: + * insert every data object and cluster them incrementally,it has three + * conditions: * 1. data object fits into closest micro cluster; - * 2. data does not fit in any cluster, delete oldest one & create a new cluster; + * 2. data does not fit in any cluster, delete oldest one & create a new + * cluster; * 3. merge two closest clusters & create a new cluster. * @Param:dimension: dimension of data object * pointNumber: total number of data objects collected in workload @@ -247,114 +238,109 @@ void SESAME::CluStream::Init() * input: vector of data streams * @Return: store the output result(with computed clustering center) into ???// */ -void SESAME::CluStream::RunOnline(SESAME::PointPtr input) -{ - ds_timer.Tick(); - if (!this->initilized) - { - this->initialInputs.push_back(input->copy()); - this->startTime = initialInputs.at(0)->getIndex(); - if (this->initialInputs.size() == this->CluStreamParam.buf_size) - { - ds_timer.Tock(); - ref_timer.Tick(); - vector initData; // initialData - initOffline(this->initialInputs, initData); - ref_timer.Tock(); - ds_timer.Tick(); - window->pyramidalWindowProcess(startTime, microClusters); - this->initilized = true; - } +void SESAME::CluStream::RunOnline(SESAME::PointPtr input) { + ds_timer.Tick(); + if (!this->initilized) { + this->initialInputs.push_back(input->copy()); + this->startTime = initialInputs.at(0)->getIndex(); + if (this->initialInputs.size() == this->CluStreamParam.buf_size) { + ds_timer.Tock(); + ref_timer.Tick(); + vector initData; // initialData + initOffline(this->initialInputs, initData); + ref_timer.Tock(); + ds_timer.Tick(); + window->pyramidalWindowProcess(startTime, microClusters); + this->initilized = true; } 
- else - { - int interval; - interval = input->getIndex() - lastUpdateTime; + } else { + int interval; + interval = input->getIndex() - lastUpdateTime; - // TODO we just assume take a snapshot per 1000 points - if (interval >= 1000) - { - window->pyramidalWindowProcess(interval, microClusters); - lastUpdateTime = input->getIndex(); - } - incrementalCluster(input->copy()); + // TODO we just assume take a snapshot per 1000 points + if (interval >= 1000) { + window->pyramidalWindowProcess(interval, microClusters); + lastUpdateTime = input->getIndex(); } - ds_timer.Tock(); - lat_timer.Add(input->toa); + incrementalCluster(input->copy()); + } + ds_timer.Tock(); + lat_timer.Add(input->toa); } -void SESAME::CluStream::RunOffline(SESAME::DataSinkPtr sinkPtr) -{ - on_timer.Add(sum_timer.start); - ref_timer.Tick(); - vector> groups; - int elapsedTime = this->CluStreamParam.num_points; - int landmarkTime = elapsedTime - this->CluStreamParam.offline_time_window; - if (this->CluStreamParam.offline_time_window > elapsedTime) landmarkTime = 0; - // SESAME_INFO("Start offline..."); - SESAME::SnapshotPtr landmarkSnapshot; - SESAME::SnapshotPtr subtractMiroCluster; - // If offline_time_window = 0, Only Observe the end results of micro clusters - subtractMiroCluster = DataStructureFactory::createSnapshot(microClusters, elapsedTime); - - // The offline is to observe a process of data stream clustering - if (CluStreamParam.offline_time_window != 0) - { // - landmarkSnapshot = - SESAME::Snapshot::findSnapshot(window->orderSnapShots, landmarkTime, elapsedTime, - window->pyramidalWindow.currentOrder); +void SESAME::CluStream::RunOffline(SESAME::DataSinkPtr sinkPtr) { + on_timer.Add(sum_timer.start); + ref_timer.Tick(); + vector> groups; + int elapsedTime = this->CluStreamParam.num_points; + int landmarkTime = elapsedTime - this->CluStreamParam.offline_time_window; + if (this->CluStreamParam.offline_time_window > elapsedTime) + landmarkTime = 0; + // SESAME_INFO("Start offline..."); + 
SESAME::SnapshotPtr landmarkSnapshot; + SESAME::SnapshotPtr subtractMiroCluster; + // If offline_time_window = 0, Only Observe the end results of micro clusters + subtractMiroCluster = + DataStructureFactory::createSnapshot(microClusters, elapsedTime); - // SESAME_INFO("Landmark Miro Cluster is..."); - for (int i = 0; i < CluStreamParam.num_clusters; i++) - { - std::stringstream re2; - std::copy(landmarkSnapshot->microClusters[i]->id.begin(), - landmarkSnapshot->microClusters[i]->id.end(), - std::ostream_iterator(re2, " ")); - // SESAME_INFO("The ID is " << re2.str() << "weight is " << - // landmarkSnapshot->microClusters[i]->weight); - } - if (landmarkSnapshot->elapsedTime == -1) landmarkSnapshot = subtractMiroCluster; + // The offline is to observe a process of data stream clustering + if (CluStreamParam.offline_time_window != 0) { // + landmarkSnapshot = SESAME::Snapshot::findSnapshot( + window->orderSnapShots, landmarkTime, elapsedTime, + window->pyramidalWindow.currentOrder); - subtractMiroCluster = SESAME::Snapshot::substractSnapshot( - subtractMiroCluster, landmarkSnapshot, this->CluStreamParam.num_clusters); + // SESAME_INFO("Landmark Miro Cluster is..."); + for (int i = 0; i < CluStreamParam.num_clusters; i++) { + std::stringstream re2; + std::copy(landmarkSnapshot->microClusters[i]->id.begin(), + landmarkSnapshot->microClusters[i]->id.end(), + std::ostream_iterator(re2, " ")); + // SESAME_INFO("The ID is " << re2.str() << "weight is " << + // landmarkSnapshot->microClusters[i]->weight); } - // SESAME_INFO("subtract Miro Cluster is..."); - for (int i = 0; i < CluStreamParam.num_clusters; i++) - { - std::stringstream re2; - std::copy(subtractMiroCluster->microClusters[i]->id.begin(), - subtractMiroCluster->microClusters[i]->id.end(), - std::ostream_iterator(re2, " ")); - // SESAME_INFO("The ID is " << re2.str() << "weight is " << - // subtractMiroCluster->microClusters[i]->weight); - } - vector TransformedSnapshot; - 
microClusterToPoint(subtractMiroCluster->microClusters, TransformedSnapshot); + if (landmarkSnapshot->elapsedTime == -1) + landmarkSnapshot = subtractMiroCluster; - // SESAME_INFO("offline Cluster Number " << this->CluStreamParam.num_offline_clusters << "Total - // number of p: " << TransformedSnapshot.size()); + subtractMiroCluster = SESAME::Snapshot::substractSnapshot( + subtractMiroCluster, landmarkSnapshot, + this->CluStreamParam.num_clusters); + } + // SESAME_INFO("subtract Miro Cluster is..."); + for (int i = 0; i < CluStreamParam.num_clusters; i++) { + std::stringstream re2; + std::copy(subtractMiroCluster->microClusters[i]->id.begin(), + subtractMiroCluster->microClusters[i]->id.end(), + std::ostream_iterator(re2, " ")); + // SESAME_INFO("The ID is " << re2.str() << "weight is " << + // subtractMiroCluster->microClusters[i]->weight); + } + vector TransformedSnapshot; + microClusterToPoint(subtractMiroCluster->microClusters, TransformedSnapshot); - std::vector centers; - std::vector> oldGroups, newGroups; + // SESAME_INFO("offline Cluster Number " << + // this->CluStreamParam.num_offline_clusters << "Total number of p: " << + // TransformedSnapshot.size()); - this->kmeans->Run(param, centers, sinkPtr); - // this->kmeans->runKMeans(this->CluStreamParam.num_offline_clusters, - // this->CluStreamParam.num_clusters,centers, - // TransformedSnapshot, oldGroups, newGroups, this->CluStreamParam.seed, - // true); - // Count overall time + std::vector centers; + std::vector> oldGroups, newGroups; - // store the result input output - // this->kmeans->produceResult(oldGroups,sinkPtr); - // timerMeter.printTime(true, true,true,false); - for (auto out = this->delMicroClusters.begin(); out != this->delMicroClusters.end(); ++out) - { - PointPtr center = out->get()->getCenter(); - center->setClusteringCenter(-1); - center->setOutlier(true); - sinkPtr->put(center); - } - ref_timer.Tock(); - sum_timer.Tock(); + this->kmeans->Run(param, centers, sinkPtr); + // 
this->kmeans->runKMeans(this->CluStreamParam.num_offline_clusters, + // this->CluStreamParam.num_clusters,centers, + // TransformedSnapshot, oldGroups, newGroups, + // this->CluStreamParam.seed, true); + // Count overall time + + // store the result input output + // this->kmeans->produceResult(oldGroups,sinkPtr); + // timerMeter.printTime(true, true,true,false); + for (auto out = this->delMicroClusters.begin(); + out != this->delMicroClusters.end(); ++out) { + PointPtr center = out->get()->getCenter(); + center->setClusteringCenter(-1); + center->setOutlier(true); + sinkPtr->put(center); + } + ref_timer.Tock(); + sum_timer.Tock(); } diff --git a/src/Algorithm/DBStream.cpp b/src/Algorithm/DBStream.cpp index bb46edfb..82d37fce 100644 --- a/src/Algorithm/DBStream.cpp +++ b/src/Algorithm/DBStream.cpp @@ -15,17 +15,16 @@ * base: decay function base -- Normally 2 * @Return: void */ -SESAME::DBStream::DBStream(param_t &cmd_params) -{ - this->param = cmd_params; - this->dbStreamParams.num_points = cmd_params.num_points; - this->dbStreamParams.dim = cmd_params.dim; - this->dbStreamParams.radius = cmd_params.radius; - this->dbStreamParams.lambda = cmd_params.lambda; - this->dbStreamParams.clean_interval = cmd_params.clean_interval; - this->dbStreamParams.min_weight = cmd_params.min_weight; - this->dbStreamParams.alpha = cmd_params.alpha; - this->dbStreamParams.base = cmd_params.base; +SESAME::DBStream::DBStream(param_t &cmd_params) { + this->param = cmd_params; + this->dbStreamParams.num_points = cmd_params.num_points; + this->dbStreamParams.dim = cmd_params.dim; + this->dbStreamParams.radius = cmd_params.radius; + this->dbStreamParams.lambda = cmd_params.lambda; + this->dbStreamParams.clean_interval = cmd_params.clean_interval; + this->dbStreamParams.min_weight = cmd_params.min_weight; + this->dbStreamParams.alpha = cmd_params.alpha; + this->dbStreamParams.base = cmd_params.base; } SESAME::DBStream::~DBStream() = default; @@ -34,67 +33,62 @@ SESAME::DBStream::~DBStream() = 
default; * @Param: void * @Return: void */ -void SESAME::DBStream::Init() -{ - this->dampedWindow = - WindowFactory::createDampedWindow(dbStreamParams.base, dbStreamParams.lambda); - this->pointArrivingTime = 0; - this->lastCleanTime = 0; - this->lastArrivingTime = 0; - this->weakEntry = - pow(dbStreamParams.base, (-1) * dbStreamParams.lambda * dbStreamParams.clean_interval); - this->aWeakEntry = weakEntry * dbStreamParams.alpha; +void SESAME::DBStream::Init() { + this->dampedWindow = WindowFactory::createDampedWindow(dbStreamParams.base, + dbStreamParams.lambda); + this->pointArrivingTime = 0; + this->lastCleanTime = 0; + this->lastArrivingTime = 0; + this->weakEntry = pow(dbStreamParams.base, (-1) * dbStreamParams.lambda * + dbStreamParams.clean_interval); + this->aWeakEntry = weakEntry * dbStreamParams.alpha; - // std::cout<<"weakEntry"<microClusterIndex = -1; - connectedRegions = ConnectedRegions(dbStreamParams.alpha, dbStreamParams.min_weight); - sum_timer.Tick(); + // std::cout<<"weakEntry"<microClusterIndex = -1; + connectedRegions = + ConnectedRegions(dbStreamParams.alpha, dbStreamParams.min_weight); + sum_timer.Tick(); } /** - * @Description: online clustering stage, input data point incrementally and update the MC list and - * weight adjacency lists, + * @Description: online clustering stage, input data point incrementally and + * update the MC list and weight adjacency lists, * @Param: void * @Return: void */ -void SESAME::DBStream::RunOnline(PointPtr input) -{ - if (!this->isInitial) - { - // SESAME_INFO("Start initialize..."); - Init(); - this->isInitial = true; - update(input); - } - else - { - update(input); - lastArrivingTime = pointArrivingTime; - lastArrivingTime0 = pointArrivingTime0; - } - lat_timer.Add(input->toa); +void SESAME::DBStream::RunOnline(PointPtr input) { + if (!this->isInitial) { + // SESAME_INFO("Start initialize..."); + Init(); + this->isInitial = true; + update(input); + } else { + update(input); + lastArrivingTime = 
pointArrivingTime; + lastArrivingTime0 = pointArrivingTime0; + } + lat_timer.Add(input->toa); } -void SESAME::DBStream::RunOffline(DataSinkPtr sinkPtr) -{ - on_timer.Add(sum_timer.start); - ref_timer.Tick(); - SESAME_INFO("micro clusters " << microClusters.size()); - SESAME_INFO("weightedAdjacencyList " << weightedAdjacencyList.size()); - // std::cout<<"micro clusters "< points = connectedRegions.ResultsToDataSink(); - for (auto i = 0; i < points.size(); i++) - { - auto res = points[i]; - res->setClusteringCenter(i); - sinkPtr->put(res); - } +void SESAME::DBStream::RunOffline(DataSinkPtr sinkPtr) { + on_timer.Add(sum_timer.start); + ref_timer.Tick(); + SESAME_INFO("micro clusters " << microClusters.size()); + SESAME_INFO("weightedAdjacencyList " << weightedAdjacencyList.size()); + // std::cout<<"micro clusters "< points = connectedRegions.ResultsToDataSink(); + for (auto i = 0; i < points.size(); i++) { + auto res = points[i]; + res->setClusteringCenter(i); + sinkPtr->put(res); + } - // timerMeter.printTime(false,false,true,false); - ref_timer.Tock(); - sum_timer.Tock(); + // timerMeter.printTime(false,false,true,false); + ref_timer.Tock(); + sum_timer.Tock(); } /** * @Description: Insert data point into existing MCs, @@ -107,165 +101,162 @@ void SESAME::DBStream::RunOffline(DataSinkPtr sinkPtr) * @Param: data point * @Return: void */ -void SESAME::DBStream::update(PointPtr dataPoint) -{ - win_timer.Tick(); - this->pointArrivingTime = dataPoint->getIndex(); - clock_gettime(CLOCK_REALTIME, &this->pointArrivingTime0); - double decayFactor = dampedWindow->decayFunction(lastArrivingTime, this->pointArrivingTime); - // TODO this one is using timespec to calculate time - // double decayFactor0 = dampedWindow->decayFunction(lastArrivingTime,this->pointArrivingTime); - this->microClusterNN = findFixedRadiusNN(dataPoint, decayFactor); // decayFactor - std::vector::size_type sizeNN = microClusterNN.size(); - win_timer.Tock(); +void SESAME::DBStream::update(PointPtr 
dataPoint) { + win_timer.Tick(); + this->pointArrivingTime = dataPoint->getIndex(); + clock_gettime(CLOCK_REALTIME, &this->pointArrivingTime0); + double decayFactor = + dampedWindow->decayFunction(lastArrivingTime, this->pointArrivingTime); + // TODO this one is using timespec to calculate time + // double decayFactor0 = + // dampedWindow->decayFunction(lastArrivingTime,this->pointArrivingTime); + this->microClusterNN = + findFixedRadiusNN(dataPoint, decayFactor); // decayFactor + std::vector::size_type sizeNN = microClusterNN.size(); + win_timer.Tock(); - /* * - * If this point fits in no micro clusters - * */ - if (microClusterNN.empty()) - { + /* * + * If this point fits in no micro clusters + * */ + if (microClusterNN.empty()) { + ds_timer.Tick(); + microClusterIndex++; + MicroClusterPtr newMicroCluster = + SESAME::DataStructureFactory::createMicroCluster( + dbStreamParams.dim, microClusterIndex, dataPoint, + dbStreamParams.radius); + microClusters.push_back(newMicroCluster); + microClusterNN.push_back(newMicroCluster); + ds_timer.Tock(); + } else { + // SESAME_INFO("microClusterNN size " << sizeNN); + for (int i = 0; i < sizeNN; i++) { + ds_timer.Tick(); + microClusterNN[i]->insert(dataPoint); // just update weight // + ds_timer.Tock(); + // std::cout<<" cluster "<id.front()<<"th weight is + // "<weight<insert(dataPoint); // just update weight // - ds_timer.Tock(); - // std::cout<<" cluster "<id.front()<<"th weight is - // "<weight<updateTime; - double decayValue = - dampedWindow->decayFunction(startT, this->pointArrivingTime); - // Timespec - // timespec startT= weightedAdjacencyList[microClusterPair]->updateTime0; - // double decayValue0 = - // dampedWindow->decayFunction(startT,this->pointArrivingTime0); - weightedAdjacencyList[microClusterPair]->add(this->pointArrivingTime, - decayValue); - } - else - { - // SESAME_INFO("insert Sij"); - AdjustedWeightPtr adjustedWeight = - SESAME::DataStructureFactory::createAdjustedWeight( - 1, this->pointArrivingTime, 
this->pointArrivingTime0); - DensityGraph densityGraph(microClusterPair, adjustedWeight); - weightedAdjacencyList.insert(densityGraph); - } - win_timer.Tock(); - } + win_timer.Tick(); + if (weightedAdjacencyList.find(microClusterPair) != + weightedAdjacencyList.end()) { + // SESAME_INFO("update Sij"); + // update existing micro cluster pair in the graph + int startT = weightedAdjacencyList[microClusterPair]->updateTime; + double decayValue = + dampedWindow->decayFunction(startT, this->pointArrivingTime); + // Timespec + // timespec startT= + // weightedAdjacencyList[microClusterPair]->updateTime0; double + // decayValue0 = + // dampedWindow->decayFunction(startT,this->pointArrivingTime0); + weightedAdjacencyList[microClusterPair]->add(this->pointArrivingTime, + decayValue); + } else { + // SESAME_INFO("insert Sij"); + AdjustedWeightPtr adjustedWeight = + SESAME::DataStructureFactory::createAdjustedWeight( + 1, this->pointArrivingTime, this->pointArrivingTime0); + DensityGraph densityGraph(microClusterPair, adjustedWeight); + weightedAdjacencyList.insert(densityGraph); } - ds_timer.Tick(); - if (checkMove(microClusterNN)) - for (const MicroClusterPtr µCluster : microClusterNN) microCluster->move(); - ds_timer.Tock(); + win_timer.Tock(); + } } - out_timer.Tick(); + ds_timer.Tick(); + if (checkMove(microClusterNN)) + for (const MicroClusterPtr µCluster : microClusterNN) + microCluster->move(); + ds_timer.Tock(); + } + out_timer.Tick(); - // if (((pointArrivingTime-this->lastCleanTime)/CLOCKS_PER_SEC)>= dbStreamParams.clean_interval - // && dataPoint->getIndex()!=0) - if ((pointArrivingTime) % dbStreamParams.clean_interval == 0) - // if(interval/1000L%dbStreamParams.clean_interval==0 ) - { - cleanUp(pointArrivingTime); // pointArrivingTime + // if (((pointArrivingTime-this->lastCleanTime)/CLOCKS_PER_SEC)>= + // dbStreamParams.clean_interval + // && dataPoint->getIndex()!=0) + if ((pointArrivingTime) % dbStreamParams.clean_interval == 0) + // 
if(interval/1000L%dbStreamParams.clean_interval==0 ) + { + cleanUp(pointArrivingTime); // pointArrivingTime - this->lastCleanTime = this->pointArrivingTime; - } - out_timer.Tock(); + this->lastCleanTime = this->pointArrivingTime; + } + out_timer.Tock(); } -std::vector SESAME::DBStream::findFixedRadiusNN(PointPtr dataPoint, - double decayFactor) -{ - std::vector result; - std::vector::size_type iter; - // todo this is a test for time - for (iter = 0; iter < microClusters.size(); iter++) - { - microClusters.at(iter)->decayWeight(decayFactor); - double distance = microClusters.at(iter)->getDistance(dataPoint); - // SESAME_INFO("distance " << distance) - if (distance < dbStreamParams.radius) result.push_back(microClusters.at(iter)); - } - return result; +std::vector +SESAME::DBStream::findFixedRadiusNN(PointPtr dataPoint, double decayFactor) { + std::vector result; + std::vector::size_type iter; + // todo this is a test for time + for (iter = 0; iter < microClusters.size(); iter++) { + microClusters.at(iter)->decayWeight(decayFactor); + double distance = microClusters.at(iter)->getDistance(dataPoint); + // SESAME_INFO("distance " << distance) + if (distance < dbStreamParams.radius) + result.push_back(microClusters.at(iter)); + } + return result; } -bool SESAME::DBStream::checkMove(std::vector microClustersList) const -{ - if (!microClustersList.empty()) - { - std::vector::size_type i, j; - for (i = 0; i < microClustersList.size(); i++) - { - for (j = i + 1; j < microClustersList.size(); j++) - { - double distance = microClustersList.at(i)->getDistance(microClustersList.at(j)); - if (distance < dbStreamParams.radius) return false; - } - } +bool SESAME::DBStream::checkMove( + std::vector microClustersList) const { + if (!microClustersList.empty()) { + std::vector::size_type i, j; + for (i = 0; i < microClustersList.size(); i++) { + for (j = i + 1; j < microClustersList.size(); j++) { + double distance = + microClustersList.at(i)->getDistance(microClustersList.at(j)); + 
if (distance < dbStreamParams.radius) + return false; + } } - return true; + } + return true; } -void SESAME::DBStream::cleanUp(int nowTime) -{ - std::vector removeMicroCluster; - std::vector::size_type iter; - // Check the current micro Clusters whether they have weak MCs - // This just test for remove id - std::vector idList; - for (iter = 0; iter < microClusters.size(); iter++) - { - if (microClusters.at(iter)->weight <= this->weakEntry) - { - removeMicroCluster.push_back(microClusters.at(iter)->copy()); - idList.push_back(microClusters.at(iter)->id.front()); - microClusters.erase(microClusters.begin() + - int(iter)); // Delete this MC from current MC list - } +void SESAME::DBStream::cleanUp(int nowTime) { + std::vector removeMicroCluster; + std::vector::size_type iter; + // Check the current micro Clusters whether they have weak MCs + // This just test for remove id + std::vector idList; + for (iter = 0; iter < microClusters.size(); iter++) { + if (microClusters.at(iter)->weight <= this->weakEntry) { + removeMicroCluster.push_back(microClusters.at(iter)->copy()); + idList.push_back(microClusters.at(iter)->id.front()); + microClusters.erase(microClusters.begin() + + int(iter)); // Delete this MC from current MC list } - auto iterW = weightedAdjacencyList.begin(); - while (iterW != weightedAdjacencyList.end()) - { - auto val1 = iterW->first.microCluster1->id.front(), - val2 = iterW->first.microCluster2->id.front(); - auto exist1 = - std::find_if(removeMicroCluster.begin(), removeMicroCluster.end(), - [&](const MicroClusterPtr &mc) { return mc->id.front() == val1; }); - auto exist2 = - std::find_if(removeMicroCluster.begin(), removeMicroCluster.end(), - [&](const MicroClusterPtr &mc) { return mc->id.front() == val2; }); - if (exist1 != removeMicroCluster.end() || exist2 != removeMicroCluster.end()) - { - iterW = weightedAdjacencyList.erase(iterW); - } - else - { - double decayFactor = dampedWindow->decayFunction(iterW->second->updateTime, nowTime); - // double 
decayFactor0=dampedWindow->decayFunction(iterW->second->updateTime0,nowTime); - if (iterW->second->getCurrentWeight(decayFactor) < aWeakEntry) - iterW = weightedAdjacencyList.erase(iterW); - else - iterW++; - } + } + auto iterW = weightedAdjacencyList.begin(); + while (iterW != weightedAdjacencyList.end()) { + auto val1 = iterW->first.microCluster1->id.front(), + val2 = iterW->first.microCluster2->id.front(); + auto exist1 = std::find_if( + removeMicroCluster.begin(), removeMicroCluster.end(), + [&](const MicroClusterPtr &mc) { return mc->id.front() == val1; }); + auto exist2 = std::find_if( + removeMicroCluster.begin(), removeMicroCluster.end(), + [&](const MicroClusterPtr &mc) { return mc->id.front() == val2; }); + if (exist1 != removeMicroCluster.end() || + exist2 != removeMicroCluster.end()) { + iterW = weightedAdjacencyList.erase(iterW); + } else { + double decayFactor = + dampedWindow->decayFunction(iterW->second->updateTime, nowTime); + // double + // decayFactor0=dampedWindow->decayFunction(iterW->second->updateTime0,nowTime); + if (iterW->second->getCurrentWeight(decayFactor) < aWeakEntry) + iterW = weightedAdjacencyList.erase(iterW); + else + iterW++; } - // SESAME_INFO("CLEAN! 
now weightedAdjacencyList size:"<param = cmd_params; } SESAME::DStream::~DStream() = default; -void SESAME::DStream::Init() -{ - sum_timer.Tick(); - ds_timer.Tick(); - dampedWindow = WindowFactory::createDampedWindow(param.lambda, 1); - gap = 1; - dm = -1.0; - dl = -1.0; - NGrids = 1; - minVals = std::vector(param.dim, DBL_MAX); - maxVals = std::vector(param.dim, DBL_MIN); - Coord = std::vector(param.dim, 0); - ds_timer.Tock(); +void SESAME::DStream::Init() { + sum_timer.Tick(); + ds_timer.Tick(); + dampedWindow = WindowFactory::createDampedWindow(param.lambda, 1); + gap = 1; + dm = -1.0; + dl = -1.0; + NGrids = 1; + minVals = std::vector(param.dim, DBL_MAX); + maxVals = std::vector(param.dim, DBL_MIN); + Coord = std::vector(param.dim, 0); + ds_timer.Tock(); } -void SESAME::DStream::RunOnline(PointPtr input) -{ - this->currentTimeStamp = input->getIndex(); - ifReCalculate(input); - GridListUpdate(Coord); // tempCoord - // 5. If tc == gap, then initial clustering - // and - // 6. If tc mod gap == 0, then: - // Detect and remove sporadic grids from grid_list then adjust clustering - if (!init && currentTimeStamp >= gap) - { - // SESAME_INFO("Initial clustering! gap is "<toa); +void SESAME::DStream::RunOnline(PointPtr input) { + this->currentTimeStamp = input->getIndex(); + ifReCalculate(input); + GridListUpdate(Coord); // tempCoord + // 5. If tc == gap, then initial clustering + // and + // 6. If tc mod gap == 0, then: + // Detect and remove sporadic grids from grid_list then adjust clustering + if (!init && currentTimeStamp >= gap) { + // SESAME_INFO("Initial clustering! 
gap is "<toa); } -void SESAME::DStream::RunOffline(DataSinkPtr sinkPtr) -{ - cout << "num_grids: " << NGrids << endl; - cout << "gap: " << gap << endl; - on_timer.Add(sum_timer.start); - ref_timer.Tick(); - // SESAME_INFO(" cluster list size "<clusterList.size(); iter++) - { - PointPtr point = DataStructureFactory::createPoint(iter, 0, param.dim, 0); - auto count = 0; - for (auto &iterGrid : this->clusterList.at(iter).grids) - { - for (int iterDim = 0; iterDim < param.dim; iterDim++) - { - if (count == 0) point->setFeatureItem(0, iterDim); - point->setFeatureItem( - point->getFeatureItem(iterDim) + iterGrid.first.coordinates[iterDim], iterDim); - if (count == this->clusterList.at(iter).grids.size() - 1) - { - point->setFeatureItem(point->getFeatureItem(iterDim) / param.dim, iterDim); - } - } - double weight = gridList.find(iterGrid.first)->second.gridDensity; - point->setWeight(point->getWeight() + weight); - count++; +void SESAME::DStream::RunOffline(DataSinkPtr sinkPtr) { + cout << "num_grids: " << NGrids << endl; + cout << "gap: " << gap << endl; + on_timer.Add(sum_timer.start); + ref_timer.Tick(); + // SESAME_INFO(" cluster list size "<clusterList.size(); iter++) { + PointPtr point = GenericFactory::New(param.dim, iter); + auto count = 0; + for (auto &iterGrid : this->clusterList.at(iter).grids) { + for (int iterDim = 0; iterDim < param.dim; iterDim++) { + if (count == 0) + point->setFeatureItem(0, iterDim); + point->setFeatureItem(point->getFeatureItem(iterDim) + + iterGrid.first.coordinates[iterDim], + iterDim); + if (count == this->clusterList.at(iter).grids.size() - 1) { + point->setFeatureItem(point->getFeatureItem(iterDim) / param.dim, + iterDim); } - point->setClusteringCenter(cluID++); - sinkPtr->put(point); + } + double weight = gridList.find(iterGrid.first)->second.gridDensity; + point->setWeight(point->getWeight() + weight); + count++; } - ref_timer.Tock(); - sum_timer.Tock(); + point->setClusteringCenter(cluID++); + sinkPtr->put(point); + } + 
ref_timer.Tock(); + sum_timer.Tock(); } -void SESAME::DStream::ifReCalculate(PointPtr point) -{ - bool recalculateN = false; - for (int i = 0; i < param.dim; i++) - { - auto feature = point->getFeatureItem(i); - if (feature > maxVals[i]) - { - maxVals[i] = feature; - recalculateN = true; - } - else if (feature < minVals[i]) - { - minVals[i] = feature; - recalculateN = true; - } - Coord[i] = point->getFeatureItem(i) / param.grid_width; +void SESAME::DStream::ifReCalculate(PointPtr point) { + bool recalculateN = false; + for (int i = 0; i < param.dim; i++) { + auto feature = point->getFeatureItem(i); + if (feature > maxVals[i]) { + maxVals[i] = feature; + recalculateN = true; + } else if (feature < minVals[i]) { + minVals[i] = feature; + recalculateN = true; } - if (recalculateN) reCalculateParameter(); + Coord[i] = point->getFeatureItem(i) / param.grid_width; + } + if (recalculateN) + reCalculateParameter(); } -void SESAME::DStream::reCalculateParameter() -{ - int curGridNumber = 1; - for (int i = 0; i < param.dim; i++) - { - int gridNum = floor((maxVals[i] - minVals[i]) / param.grid_width); - if (gridNum <= 0) gridNum = 1; - curGridNumber = curGridNumber * gridNum; - } - double dlBack = param.cl / (curGridNumber * (1.0 - param.lambda)); - double dmBack = param.cm / (curGridNumber * (1.0 - param.lambda)); - if (dlBack < 0) - return; - else - { - this->dl = dlBack; - this->dm = dmBack; - this->NGrids = curGridNumber; - // // SESAME_INFO(" dl = " << this->dl << ", dm = " << this->dm); - // // SESAME_INFO("TOTAL GRIDS ARE "<NGrids); - // Calculate the value for gap using the method defined in eq 26 of Chen and Tu 2007 - double optionA = param.cl / param.cm; - double optionB = ((double)this->NGrids - param.cm) / ((double)this->NGrids - param.cl); - auto de = log(param.lambda); - auto nu = log(max(optionA, optionB)); - auto res = (int)floor(nu / de); - // Ensure that gap is not zero (i.e. 
if the procedure to calculate gap rounds down to zero, - // then set gap to 1 and adjust clustering every instance) - if (res > 1) - { - gap = res; - } +void SESAME::DStream::reCalculateParameter() { + int curGridNumber = 1; + for (int i = 0; i < param.dim; i++) { + int gridNum = floor((maxVals[i] - minVals[i]) / param.grid_width); + if (gridNum <= 0) + gridNum = 1; + curGridNumber = curGridNumber * gridNum; + } + double dlBack = param.cl / (curGridNumber * (1.0 - param.lambda)); + double dmBack = param.cm / (curGridNumber * (1.0 - param.lambda)); + if (dlBack < 0) + return; + else { + this->dl = dlBack; + this->dm = dmBack; + this->NGrids = curGridNumber; + // // SESAME_INFO(" dl = " << this->dl << ", dm = " << this->dm); + // // SESAME_INFO("TOTAL GRIDS ARE "<NGrids); + // Calculate the value for gap using the method defined in eq 26 of Chen and + // Tu 2007 + double optionA = param.cl / param.cm; + double optionB = + ((double)this->NGrids - param.cm) / ((double)this->NGrids - param.cl); + auto de = log(param.lambda); + auto nu = log(max(optionA, optionB)); + auto res = (int)floor(nu / de); + // Ensure that gap is not zero (i.e. if the procedure to calculate gap + // rounds down to zero, then set gap to 1 and adjust clustering every + // instance) + if (res > 1) { + gap = res; } + } } /* Update the grid list of DStream when data inserting into the grid * */ -void SESAME::DStream::GridListUpdate(std::vector coordinate) -{ - CharacteristicVector characteristicVec; - DensityGrid grid(coordinate); - // 3. 
If (g not in grid_list) insert dg to grid_list - auto it = this->gridList.find(grid); - if (it == gridList.end()) - { // SESAME_INFO("Insert "<< currentTimeStamp); - auto it2 = this->deletedGrids.find(grid); - if (it2 != deletedGrids.end()) - { - characteristicVec = - CharacteristicVector(currentTimeStamp, it2->second, 1.0, -1, false, dl, dm); - this->deletedGrids.erase(grid); - } - - else - characteristicVec = CharacteristicVector(currentTimeStamp, -1, 1.0, -1, false, dl, dm); - // this->gridList.insert(std::make_pair(grid, characteristicVec)); - this->gridList.insert(std::make_pair(grid, characteristicVec)); +void SESAME::DStream::GridListUpdate(std::vector coordinate) { + CharacteristicVector characteristicVec; + DensityGrid grid(coordinate); + // 3. If (g not in grid_list) insert dg to grid_list + auto it = this->gridList.find(grid); + if (it == gridList.end()) { // SESAME_INFO("Insert "<< currentTimeStamp); + auto it2 = this->deletedGrids.find(grid); + if (it2 != deletedGrids.end()) { + characteristicVec = CharacteristicVector(currentTimeStamp, it2->second, + 1.0, -1, false, dl, dm); + this->deletedGrids.erase(grid); } - // 4. Update the characteristic vector of dg + else - { - // SESAME_INFO("Update Grid"); - ds_timer.Tock(); - win_timer.Tick(); - characteristicVec = it->second; - characteristicVec.densityWithNew(currentTimeStamp, param.lambda); - characteristicVec.updateTime = currentTimeStamp; - it->second = characteristicVec; - win_timer.Tock(); - ds_timer.Tick(); - } + characteristicVec = + CharacteristicVector(currentTimeStamp, -1, 1.0, -1, false, dl, dm); + // this->gridList.insert(std::make_pair(grid, characteristicVec)); + this->gridList.insert(std::make_pair(grid, characteristicVec)); + } + // 4. 
Update the characteristic vector of dg + else { + // SESAME_INFO("Update Grid"); + ds_timer.Tock(); + win_timer.Tick(); + characteristicVec = it->second; + characteristicVec.densityWithNew(currentTimeStamp, param.lambda); + characteristicVec.updateTime = currentTimeStamp; + it->second = characteristicVec; + win_timer.Tock(); + ds_timer.Tick(); + } } /** * Implements the procedure given in Figure 3 of Chen and Tu 2007 */ -void SESAME::DStream::initialClustering() -{ - // 1. Update the density of all grids in grid_list - // Timer: online grid - updateGridListDensity(); - // 2. Assign each dense grid to a distinct cluster - // and - // 3. Label all other grids as NO_CLASS - auto gridIter = this->gridList.begin(); - HashMap newGridList; - while (gridIter != gridList.end()) - { - DensityGrid grid = gridIter->first; - CharacteristicVector characteristicVecOfG = gridIter->second; - if (characteristicVecOfG.attribute == DENSE) - { - int gridClass = this->clusterList.size(); - characteristicVecOfG.label = gridClass; - GridCluster gridCluster = GridCluster(gridClass); - gridCluster.addGrid(grid); - this->clusterList.push_back(gridCluster); - // SESAME_INFO(" was dense (class "<clusterList.size()); - } - else - characteristicVecOfG.label = NO_CLASS; - // newGridList.insert(std::make_pair(grid, characteristicVecOfG)); - // newGridList= putHashMap(newGridList,grid, characteristicVecOfG); - - auto it1 = newGridList.find(grid); - if (it1 != newGridList.end()) - it1->second = characteristicVecOfG; - else - newGridList.insert(std::make_pair(grid, characteristicVecOfG)); - - ++gridIter; - } - this->gridList = newGridList; - // 4. Make changes to grid labels by doing: - // a. For each cluster c - // b. For each outside grid g of c - // c. For each neighbouring grid h of g - // d. If h belongs to c', label c and c' with - // the label of the largest cluster - // e. Else if h is transitional, assign it to c - // f. 
While changes can be made - while (adjustLabels()) - ; // while changes are being made +void SESAME::DStream::initialClustering() { + // 1. Update the density of all grids in grid_list + // Timer: online grid + updateGridListDensity(); + // 2. Assign each dense grid to a distinct cluster + // and + // 3. Label all other grids as NO_CLASS + auto gridIter = this->gridList.begin(); + HashMap newGridList; + while (gridIter != gridList.end()) { + DensityGrid grid = gridIter->first; + CharacteristicVector characteristicVecOfG = gridIter->second; + if (characteristicVecOfG.attribute == DENSE) { + int gridClass = this->clusterList.size(); + characteristicVecOfG.label = gridClass; + GridCluster gridCluster = GridCluster(gridClass); + gridCluster.addGrid(grid); + this->clusterList.push_back(gridCluster); + // SESAME_INFO(" was dense (class "<clusterList.size()); + } else + characteristicVecOfG.label = NO_CLASS; + // newGridList.insert(std::make_pair(grid, characteristicVecOfG)); + // newGridList= putHashMap(newGridList,grid, characteristicVecOfG); + + auto it1 = newGridList.find(grid); + if (it1 != newGridList.end()) + it1->second = characteristicVecOfG; + else + newGridList.insert(std::make_pair(grid, characteristicVecOfG)); + + ++gridIter; + } + this->gridList = newGridList; + // 4. Make changes to grid labels by doing: + // a. For each cluster c + // b. For each outside grid g of c + // c. For each neighbouring grid h of g + // d. If h belongs to c', label c and c' with + // the label of the largest cluster + // e. Else if h is transitional, assign it to c + // f. While changes can be made + while (adjustLabels()) + ; // while changes are being made } /** * Makes first change available to it by following the steps: @@ -247,92 +230,82 @@ void SESAME::DStream::initialClustering() * Else if h is transitional, assign it to c * @return TRUE if a change was made to any cluster's labels, FALSE otherwise */ -bool SESAME::DStream::adjustLabels() -{ - // bool adjust=false; - // a. 
For each cluster c +bool SESAME::DStream::adjustLabels() { + // bool adjust=false; + // a. For each cluster c + + for (GridCluster &gridCluster : this->clusterList) { + // // SESAME_INFO("Adjusting from cluster "<first; + bool inside = gridIter->second; + // // SESAME_INFO(" Inspecting density grid, grid, standby..."); + + // b. for each OUTSIDE grid of cluster + if (!inside) { + // // SESAME_INFO(" Density grid dg is outside!"); + // c. for each neighbouring grid, of current iter grid + for (const DensityGrid &gridNeighbourhood : grid.getNeighbours()) { + auto it2 = this->gridList.find(gridNeighbourhood); + if (it2 != gridList.end()) { + auto it1 = this->gridList.find(grid); + CharacteristicVector characteristicVec1 = it1->second; + CharacteristicVector characteristicVec2 = it2->second; + int class1 = characteristicVec1.label; + int class2 = characteristicVec2.label; + // ...and if neighbouring grid isn't already in the same cluster as + // grid... + if (class1 != class2) { + // If neighbouring grid is in cluster c', merge c and c' into the + // larger of the two + if (class2 != NO_CLASS) { + // SESAME_INFO(". 
Cluster number is "<clusterList.at(class1).grids.size() < + this->clusterList.at(class2).grids.size()) + mergeClusters(class1, class2); + else + mergeClusters(class2, class1); + return true; + } + // If gridNeighbourhood is transitional and 'outside' of the + // cluster, assign it to cluster + else if (characteristicVec2.isTransitional(dm, dl)) { + ////TODO CHECK HERE + characteristicVec2.label = class1; + gridCluster.addGrid(gridNeighbourhood); + this->clusterList.at(class1) = gridCluster; // Testing + // putHashMap(gridList,grid,characteristicVec2); + auto it1 = gridList.find(grid); + if (it1 != gridList.end()) + it1->second = characteristicVec2; + else + gridList.insert(std::make_pair(grid, characteristicVec2)); - for (GridCluster &gridCluster : this->clusterList) - { - // // SESAME_INFO("Adjusting from cluster "<first; - bool inside = gridIter->second; - // // SESAME_INFO(" Inspecting density grid, grid, standby..."); - - // b. for each OUTSIDE grid of cluster - if (!inside) - { - // // SESAME_INFO(" Density grid dg is outside!"); - // c. for each neighbouring grid, of current iter grid - for (const DensityGrid &gridNeighbourhood : grid.getNeighbours()) - { - auto it2 = this->gridList.find(gridNeighbourhood); - if (it2 != gridList.end()) - { - auto it1 = this->gridList.find(grid); - CharacteristicVector characteristicVec1 = it1->second; - CharacteristicVector characteristicVec2 = it2->second; - int class1 = characteristicVec1.label; - int class2 = characteristicVec2.label; - // ...and if neighbouring grid isn't already in the same cluster as - // grid... - if (class1 != class2) - { - // If neighbouring grid is in cluster c', merge c and c' into the - // larger of the two - if (class2 != NO_CLASS) - { - // SESAME_INFO(". 
Cluster number is "<clusterList.at(class1).grids.size() < - this->clusterList.at(class2).grids.size()) - mergeClusters(class1, class2); - else - mergeClusters(class2, class1); - return true; - } - // If gridNeighbourhood is transitional and 'outside' of the - // cluster, assign it to cluster - else if (characteristicVec2.isTransitional(dm, dl)) - { - ////TODO CHECK HERE - characteristicVec2.label = class1; - gridCluster.addGrid(gridNeighbourhood); - this->clusterList.at(class1) = gridCluster; // Testing - // putHashMap(gridList,grid,characteristicVec2); - auto it1 = gridList.find(grid); - if (it1 != gridList.end()) - it1->second = characteristicVec2; - else - gridList.insert(std::make_pair(grid, characteristicVec2)); - - return true; - } - } - } - } + return true; + } } + } } + } } - return false; + } + return false; } /** - * Iterates through grid_list and updates the density for each density grid therein. - * Also marks each density grid as unvisited for this call to adjustClustering. + * Iterates through grid_list and updates the density for each density grid + * therein. Also marks each density grid as unvisited for this call to + * adjustClustering. */ -void SESAME::DStream::updateGridListDensity() -{ - // // SESAME_INFO("grid list size is "<gridList.size()); - for (auto &iter : this->gridList) - { - iter.second.isVisited = false; - iter.second.UpdateAllDensity(currentTimeStamp, param.lambda, dl, dm); - } +void SESAME::DStream::updateGridListDensity() { + // // SESAME_INFO("grid list size is "<gridList.size()); + for (auto &iter : this->gridList) { + iter.second.isVisited = false; + iter.second.UpdateAllDensity(currentTimeStamp, param.lambda, dl, dm); + } } /** @@ -341,75 +314,76 @@ void SESAME::DStream::updateGridListDensity() * * @see moa.clusterers.DStream.DStream#gap */ -void SESAME::DStream::adjustClustering() -{ - // 1. Update the density of all grids in grid_list - updateGridListDensity(); - // 2. 
For each grid dg whose attribute is changed since last call - // a. If dg is sparse - // b. If dg is dense - // c. If dg is transitional - while (inspectChangedGrids()) - ; +void SESAME::DStream::adjustClustering() { + // 1. Update the density of all grids in grid_list + updateGridListDensity(); + // 2. For each grid dg whose attribute is changed since last call + // a. If dg is sparse + // b. If dg is dense + // c. If dg is transitional + while (inspectChangedGrids()) + ; } /** - * Inspects each density grid in grid_list whose attribute has changed since the last - * call to adjustClustering. Implements lines 3/4/7/19 of the procedure given in Figure - * 4 of Chen and Tu 2007. + * Inspects each density grid in grid_list whose attribute has changed since the + * last call to adjustClustering. Implements lines 3/4/7/19 of the procedure + * given in Figure 4 of Chen and Tu 2007. * * @return TRUE if any grids are updated; FALSE otherwise. */ -bool SESAME::DStream::inspectChangedGrids() -{ - HashMap newGridList; - auto gridIter = this->gridList.begin(); - int a = 0; - while (gridIter != gridList.end() && newGridList.empty()) //&& newGridList.empty() - { - const DensityGrid &grid = gridIter->first; - const CharacteristicVector &characteristicVec = gridIter->second; - int gridClass = characteristicVec.label; - if (characteristicVec.attChange && !characteristicVec.isVisited) // grid.isVisited - { // grid.isVisited=true; - gridIter->second.isVisited = true; - // SESAME_INFO(a<<"th visit! 
Whether visited "<second.isVisited); - newGridList.insert(std::make_pair(grid, characteristicVec)); - auto it1 = gridList.find(grid); - if (it1 != gridList.end()) - it1->second = characteristicVec; - else - gridList.insert(std::make_pair(grid, characteristicVec)); - if (characteristicVec.attribute == SPARSE) - mergeGridList(newGridList, adjustForSparseGrid(grid, characteristicVec, gridClass)); - else if (characteristicVec.attribute == DENSE) - mergeGridList(newGridList, adjustForDenseGrid(grid, characteristicVec, gridClass)); - else // TRANSITIONAL - mergeGridList(newGridList, - adjustForTransitionalGrid(grid, characteristicVec, gridClass)); - } - gridIter++; - a++; +bool SESAME::DStream::inspectChangedGrids() { + HashMap newGridList; + auto gridIter = this->gridList.begin(); + int a = 0; + while (gridIter != gridList.end() && + newGridList.empty()) //&& newGridList.empty() + { + const DensityGrid &grid = gridIter->first; + const CharacteristicVector &characteristicVec = gridIter->second; + int gridClass = characteristicVec.label; + if (characteristicVec.attChange && + !characteristicVec.isVisited) // grid.isVisited + { // grid.isVisited=true; + gridIter->second.isVisited = true; + // SESAME_INFO(a<<"th visit! 
Whether visited + // "<second.isVisited); + newGridList.insert(std::make_pair(grid, characteristicVec)); + auto it1 = gridList.find(grid); + if (it1 != gridList.end()) + it1->second = characteristicVec; + else + gridList.insert(std::make_pair(grid, characteristicVec)); + if (characteristicVec.attribute == SPARSE) + mergeGridList(newGridList, + adjustForSparseGrid(grid, characteristicVec, gridClass)); + else if (characteristicVec.attribute == DENSE) + mergeGridList(newGridList, + adjustForDenseGrid(grid, characteristicVec, gridClass)); + else // TRANSITIONAL + mergeGridList(newGridList, adjustForTransitionalGrid( + grid, characteristicVec, gridClass)); } - // SESAME_INFO("Inspect changes in grids "<gridList, newGridList); - // SESAME_INFO("Now grid size is "<gridList, newGridList); + // SESAME_INFO("Now grid size is "< clusterList.size()) - { - SESAME_INFO("Current grids size and cluster size is " - << gridList.size() << " and " << clusterList.size() - << " adjust For Sparse Grid " << gridClass << "."); - } - - HashMap newGridList; - // System.out.print("Density grid "+dg.toString()+" is adjusted as a sparse grid at time - // "+this.getCurrTime()+". 
"); - if (gridClass != NO_CLASS) - { - for (auto gridCluster : clusterList) - { - if (gridCluster.clusterLabel == gridClass) - { ////TODO CHECK HERE - gridCluster.removeGrid(grid); - characteristicVec.label = NO_CLASS; - // newGridList.insert(std::make_pair(grid, characteristicVec)); - // newGridList=putHashMap(newGridList,grid, characteristicVec); - auto it1 = newGridList.find(grid); - if (it1 != newGridList.end()) - it1->second = characteristicVec; - else - newGridList.insert(std::make_pair(grid, characteristicVec)); - if (!gridCluster.grids.empty() && !gridCluster.isConnected()) - mergeGridList(newGridList, reCluster(gridCluster)); - break; - } - } +SESAME::HashMap SESAME::DStream::adjustForSparseGrid( + DensityGrid grid, CharacteristicVector characteristicVec, int gridClass) { + if (gridClass != -1 && gridClass > clusterList.size()) { + SESAME_INFO("Current grids size and cluster size is " + << gridList.size() << " and " << clusterList.size() + << " adjust For Sparse Grid " << gridClass << "."); + } + + HashMap newGridList; + // System.out.print("Density grid "+dg.toString()+" is adjusted as a sparse + // grid at time + // "+this.getCurrTime()+". 
"); + if (gridClass != NO_CLASS) { + for (auto gridCluster : clusterList) { + if (gridCluster.clusterLabel == gridClass) { ////TODO CHECK HERE + gridCluster.removeGrid(grid); + characteristicVec.label = NO_CLASS; + // newGridList.insert(std::make_pair(grid, characteristicVec)); + // newGridList=putHashMap(newGridList,grid, characteristicVec); + auto it1 = newGridList.find(grid); + if (it1 != newGridList.end()) + it1->second = characteristicVec; + else + newGridList.insert(std::make_pair(grid, characteristicVec)); + if (!gridCluster.grids.empty() && !gridCluster.isConnected()) + mergeGridList(newGridList, reCluster(gridCluster)); + break; + } } - // else - // System.out.println("It was not clustered ("+dgClass+")."); - return newGridList; + } + // else + // System.out.println("It was not clustered ("+dgClass+")."); + return newGridList; } /** - * Reclusters a grid cluster into two (or more) constituent clusters when it has been identified - * that the original cluster - * is no longer a grid group. It does so by echoing the initial clustering procedure - * over only those grids in gc. + * Reclusters a grid cluster into two (or more) constituent clusters when it has + * been identified that the original cluster is no longer a grid group. It does + * so by echoing the initial clustering procedure over only those grids in gc. 
* @param gridCluster the grid cluster to be re clustered - * @return a HashMap containing density grids for update after - * this iteration + * @return a HashMap containing density grids + * for update after this iteration */ -SESAME::HashMap SESAME::DStream::reCluster(GridCluster gridCluster) -{ - HashMap newGridList; - auto gcIter = gridCluster.grids.begin(); - newClusterList = std::vector(); - // SESAME_INFO("ReCluster called for cluster "<first; - CharacteristicVector characteristicVecOfGrid = this->gridList.find(grid)->second; - if (characteristicVecOfGrid.attribute == DENSE) - { - int gridClass = (int)newClusterList.size(); - SESAME_INFO("grid class for the new cluster list size! " << gridClass); - ////TODO CHECK HERE - characteristicVecOfGrid.label = gridClass; - GridCluster newCluster(gridClass); - newCluster.addGrid(grid); - newClusterList.push_back(newCluster); - } - else - characteristicVecOfGrid.label = NO_CLASS; - // newGridList.insert(std::make_pair(grid, characteristicVecOfGrid)); - // newGridList=putHashMap(newGridList,grid, characteristicVecOfGrid); - auto it1 = newGridList.find(grid); - if (it1 != newGridList.end()) - it1->second = characteristicVecOfGrid; - else - newGridList.insert(std::make_pair(grid, characteristicVecOfGrid)); - gcIter++; +SESAME::HashMap SESAME::DStream::reCluster(GridCluster gridCluster) { + HashMap newGridList; + auto gcIter = gridCluster.grids.begin(); + newClusterList = std::vector(); + // SESAME_INFO("ReCluster called for cluster "<first; + CharacteristicVector characteristicVecOfGrid = + this->gridList.find(grid)->second; + if (characteristicVecOfGrid.attribute == DENSE) { + int gridClass = (int)newClusterList.size(); + SESAME_INFO("grid class for the new cluster list size! 
" << gridClass); + ////TODO CHECK HERE + characteristicVecOfGrid.label = gridClass; + GridCluster newCluster(gridClass); + newCluster.addGrid(grid); + newClusterList.push_back(newCluster); + } else + characteristicVecOfGrid.label = NO_CLASS; + // newGridList.insert(std::make_pair(grid, characteristicVecOfGrid)); + // newGridList=putHashMap(newGridList,grid, characteristicVecOfGrid); + auto it1 = newGridList.find(grid); + if (it1 != newGridList.end()) + it1->second = characteristicVecOfGrid; + else + newGridList.insert(std::make_pair(grid, characteristicVecOfGrid)); + gcIter++; + } + + bool changesMade; + // While changes can be made... + do { + changesMade = false; + HashMap gridListAdjusted = adjustNewLabels(newGridList); + if (!gridListAdjusted.empty()) { + SESAME_INFO("grid list is adjusted for sparse!"); + mergeGridList(newGridList, gridListAdjusted); + changesMade = true; } + } while (changesMade); - bool changesMade; - // While changes can be made... - do - { - changesMade = false; - HashMap gridListAdjusted = adjustNewLabels(newGridList); - if (!gridListAdjusted.empty()) - { - SESAME_INFO("grid list is adjusted for sparse!"); - mergeGridList(newGridList, gridListAdjusted); - changesMade = true; - } - } while (changesMade); + // Update the cluster list with the newly formed clusters + gridCluster.grids.clear(); + this->clusterList.at(gridCluster.clusterLabel) = gridCluster; + for (GridCluster &cluster : newClusterList) + this->clusterList.push_back(cluster); - // Update the cluster list with the newly formed clusters - gridCluster.grids.clear(); - this->clusterList.at(gridCluster.clusterLabel) = gridCluster; - for (GridCluster &cluster : newClusterList) this->clusterList.push_back(cluster); - - return newGridList; + return newGridList; } -SESAME::HashMap SESAME::DStream::adjustNewLabels(SESAME::HashMap newGridList) -{ - HashMap gridListAdjusted; - // a. 
For each cluster c - for (GridCluster &gridCluster : newClusterList) - { - for (auto &gridIter : gridCluster.grids) - { - DensityGrid grid = gridIter.first; - bool inside = gridIter.second; - - // b. for each OUTSIDE grid, dg, of c - if (!inside) - { - // c. for each neighbouring grid, neighbourGrid, of dg - - for (DensityGrid &neighbourGrid : grid.getNeighbours()) - { - if (newGridList.find(neighbourGrid) != newGridList.end()) - { - CharacteristicVector characteristicVec1 = - newGridList.find(neighbourGrid)->second; - CharacteristicVector characteristicVec2 = - newGridList.find(neighbourGrid)->second; - int class1 = characteristicVec1.label; - int class2 = characteristicVec2.label; - - // ...and if neighbourGrid isn't already in the same cluster as dg... - if (class1 != class2) - { - GridCluster cluster1 = newClusterList.at(class1); - // If dgprime is in cluster c', merge c and c' into the larger of the - // two - if (class2 != NO_CLASS) - { - GridCluster cluster2 = newClusterList.at(class2); - // System.out.println("C is "+class1+" and C' is "+class2+"."); - if (cluster1.grids.size() < cluster2.grids.size()) - mergeGridList(gridListAdjusted, - mergeNewClusters(newGridList, class1, class2)); - else - mergeGridList(gridListAdjusted, - mergeNewClusters(newGridList, class2, class1)); - - return gridListAdjusted; - } - // If neighbourGrid is transitional and outside of cluster, assign it to - // cluster - else if (characteristicVec2.isTransitional(dm, dl)) - { - ////TODO CHECK HERE - characteristicVec2.label = class1; - cluster1.addGrid(neighbourGrid); - this->newClusterList.at(class1) = cluster1; - // Change for detecting repeated grids - - auto it1 = gridListAdjusted.find(neighbourGrid); - if (it1 != gridListAdjusted.end()) - it1->second = characteristicVec2; - else - gridListAdjusted.insert( - std::make_pair(neighbourGrid, characteristicVec2)); - return gridListAdjusted; - } - } - } - } +SESAME::HashMap SESAME::DStream::adjustNewLabels(SESAME::HashMap newGridList) 
{ + HashMap gridListAdjusted; + // a. For each cluster c + for (GridCluster &gridCluster : newClusterList) { + for (auto &gridIter : gridCluster.grids) { + DensityGrid grid = gridIter.first; + bool inside = gridIter.second; + + // b. for each OUTSIDE grid, dg, of c + if (!inside) { + // c. for each neighbouring grid, neighbourGrid, of dg + + for (DensityGrid &neighbourGrid : grid.getNeighbours()) { + if (newGridList.find(neighbourGrid) != newGridList.end()) { + CharacteristicVector characteristicVec1 = + newGridList.find(neighbourGrid)->second; + CharacteristicVector characteristicVec2 = + newGridList.find(neighbourGrid)->second; + int class1 = characteristicVec1.label; + int class2 = characteristicVec2.label; + + // ...and if neighbourGrid isn't already in the same cluster as + // dg... + if (class1 != class2) { + GridCluster cluster1 = newClusterList.at(class1); + // If dgprime is in cluster c', merge c and c' into the larger of + // the two + if (class2 != NO_CLASS) { + GridCluster cluster2 = newClusterList.at(class2); + // System.out.println("C is "+class1+" and C' is "+class2+"."); + if (cluster1.grids.size() < cluster2.grids.size()) + mergeGridList(gridListAdjusted, + mergeNewClusters(newGridList, class1, class2)); + else + mergeGridList(gridListAdjusted, + mergeNewClusters(newGridList, class2, class1)); + + return gridListAdjusted; + } + // If neighbourGrid is transitional and outside of cluster, assign + // it to cluster + else if (characteristicVec2.isTransitional(dm, dl)) { + ////TODO CHECK HERE + characteristicVec2.label = class1; + cluster1.addGrid(neighbourGrid); + this->newClusterList.at(class1) = cluster1; + // Change for detecting repeated grids + + auto it1 = gridListAdjusted.find(neighbourGrid); + if (it1 != gridListAdjusted.end()) + it1->second = characteristicVec2; + else + gridListAdjusted.insert( + std::make_pair(neighbourGrid, characteristicVec2)); + return gridListAdjusted; + } } + } } + } } - return gridListAdjusted; + } + return 
gridListAdjusted; } /** - * Adjusts the clustering of a dense density grid. Implements lines 8 through 18 from Figure 4 of - * Chen and Tu 2007. + * Adjusts the clustering of a dense density grid. Implements lines 8 through 18 + * from Figure 4 of Chen and Tu 2007. * * @param grid the dense density grid being adjusted * @param characteristicVec the characteristic vector of dg @@ -604,368 +561,341 @@ SESAME::HashMap SESAME::DStream::adjustNewLabels(SESAME::HashMap newGridList) * * @return a HashMap containing density grids for update after this iteration */ -SESAME::HashMap SESAME::DStream::adjustForDenseGrid(DensityGrid grid, - CharacteristicVector characteristicVec, - int gridClass) -{ - if (gridClass != -1 && gridClass > clusterList.size()) - { - SESAME_INFO("Current grids size and cluster size is " - << gridList.size() << " and " << clusterList.size() - << " adjust For Dense Grid class is " << gridClass << "."); - } - // SESAME_INFO("Current grids size and cluster size is " - // << gridList.size()<<" and "<gridList.find(neighbourGrid) != gridList.end()) - { - hClass = this->gridList.find(neighbourGrid)->second.label; - if (hClass != NO_CLASS) - { - for (auto gridC : clusterList) - { - if (gridC.clusterLabel == hClass) - { - if (gridC.grids.size() > ChosenGridSize) - { - ChosenGridSize = gridC.grids.size(); - hChosenClass = hClass; - gridChosen = DensityGrid(neighbourGrid); - } - break; - } - } +SESAME::HashMap SESAME::DStream::adjustForDenseGrid( + DensityGrid grid, CharacteristicVector characteristicVec, int gridClass) { + if (gridClass != -1 && gridClass > clusterList.size()) { + SESAME_INFO("Current grids size and cluster size is " + << gridList.size() << " and " << clusterList.size() + << " adjust For Dense Grid class is " << gridClass << "."); + } + // SESAME_INFO("Current grids size and cluster size is " + // << gridList.size()<<" and "<gridList.find(neighbourGrid) != gridList.end()) { + hClass = this->gridList.find(neighbourGrid)->second.label; + if 
(hClass != NO_CLASS) { + for (auto gridC : clusterList) { + if (gridC.clusterLabel == hClass) { + if (gridC.grids.size() > ChosenGridSize) { + ChosenGridSize = gridC.grids.size(); + hChosenClass = hClass; + gridChosen = DensityGrid(neighbourGrid); } + break; + } } + } } + } - if (hChosenClass != NO_CLASS && hChosenClass != gridClass) - { - gridCluster = this->clusterList.at(hChosenClass); - - // If h is a dense grid - if (this->gridList.find(gridChosen)->second.attribute == DENSE) - { - // // SESAME_INFO("h is dense."); - // If dg is labelled as NO_CLASS - if (gridClass == NO_CLASS) - { - // // SESAME_INFO("g was labelled NO_CLASS"); - characteristicVec.label = hChosenClass; - // newGridList.insert(std::make_pair(grid, characteristicVec)); - // newGridList=putHashMap(newGridList,grid, characteristicVec); - auto it1 = newGridList.find(grid); - if (it1 != newGridList.end()) - it1->second = characteristicVec; - else - newGridList.insert(std::make_pair(grid, characteristicVec)); - gridCluster.addGrid(grid); - this->clusterList.at(hChosenClass) = gridCluster; - } - // Else if dg belongs to cluster c and h belongs to c' - else - { - // SESAME_INFO("grid was labelled "<clusterList.size()); - for (auto gridCluster : clusterList) - { - if (gridCluster.clusterLabel == gridClass) - { - double gSize = this->clusterList.at(gridClass).grids.size(); - - if (gSize <= ChosenGridSize) - mergeClusters(gridClass, hChosenClass); - else - mergeClusters(hChosenClass, gridClass); - break; - } - } - } - } + if (hChosenClass != NO_CLASS && hChosenClass != gridClass) { + gridCluster = this->clusterList.at(hChosenClass); - // Else if h is a transitional grid - else if (this->gridList.find(gridChosen)->second.attribute == TRANSITIONAL) - { - // SESAME_INFO("h is transitional."); - // If dg is labelled as no class and if h is an outside grid if dg is added to ch - if (gridClass == NO_CLASS && !gridCluster.isInside(gridChosen, grid)) - { - ////TODO CHECK HERE - characteristicVec.label = 
hChosenClass; - // newGridList.insert(std::make_pair(grid, characteristicVec)); - // newGridList=putHashMap(newGridList,grid, characteristicVec); - auto it1 = newGridList.find(grid); - if (it1 != newGridList.end()) - it1->second = characteristicVec; - else - newGridList.insert(std::make_pair(grid, characteristicVec)); - gridCluster.addGrid(grid); - this->clusterList.at(hChosenClass) = gridCluster; - // SESAME_INFO(" dg is added to cluster "<= |ch| - else if (gridClass != NO_CLASS) - { - // SESAME_INFO("Finding this cluster......"); - for (auto c : clusterList) - { - if (c.clusterLabel == gridClass) - { - // GridCluster c = this->clusterList.at(gridClass); - int gSize = c.grids.size(); - if (gSize >= ChosenGridSize) - { - // Move h from cluster ch to cluster c - gridCluster.removeGrid(gridChosen); - c.addGrid(gridChosen); - CharacteristicVector cvhChosen = - this->gridList.find(gridChosen)->second; - ////TODO CHECK HERE - cvhChosen.label = gridClass; - // newGridList.insert(std::make_pair(gridChosen, cvhChosen)); - // newGridList=putHashMap(newGridList,gridChosen, cvhChosen); - - auto it1 = newGridList.find(gridChosen); - if (it1 != newGridList.end()) - it1->second = cvhChosen; - else - newGridList.insert(std::make_pair(gridChosen, cvhChosen)); - // SESAME_INFO("dgClass is "<clusterList.at(hChosenClass) = gridCluster; - this->clusterList.at(gridClass) = c; - } - } - } - } + // If h is a dense grid + if (this->gridList.find(gridChosen)->second.attribute == DENSE) { + // // SESAME_INFO("h is dense."); + // If dg is labelled as NO_CLASS + if (gridClass == NO_CLASS) { + // // SESAME_INFO("g was labelled NO_CLASS"); + characteristicVec.label = hChosenClass; + // newGridList.insert(std::make_pair(grid, characteristicVec)); + // newGridList=putHashMap(newGridList,grid, characteristicVec); + auto it1 = newGridList.find(grid); + if (it1 != newGridList.end()) + it1->second = characteristicVec; + else + newGridList.insert(std::make_pair(grid, characteristicVec)); + 
gridCluster.addGrid(grid); + this->clusterList.at(hChosenClass) = gridCluster; + } + // Else if dg belongs to cluster c and h belongs to c' + else { + // SESAME_INFO("grid was labelled "<clusterList.size()); + for (auto gridCluster : clusterList) { + if (gridCluster.clusterLabel == gridClass) { + double gSize = this->clusterList.at(gridClass).grids.size(); + + if (gSize <= ChosenGridSize) + mergeClusters(gridClass, hChosenClass); + else + mergeClusters(hChosenClass, gridClass); + break; + } } + } } - // If dgClass is dense and not in a cluster, and none if its neighbours are in a cluster, - // put it in its own new cluster and search the neighbourhood for transitional or dense - // grids to add - else if (gridClass == NO_CLASS) - { - int newClass = (int)this->clusterList.size(); - GridCluster c = GridCluster(newClass); - c.addGrid(grid); - // System.out.println("Added "+dg.toString()+" to cluster "+newClass+"."); - this->clusterList.push_back(c); + + // Else if h is a transitional grid + else if (this->gridList.find(gridChosen)->second.attribute == + TRANSITIONAL) { + // SESAME_INFO("h is transitional."); + // If dg is labelled as no class and if h is an outside grid if dg is + // added to ch + if (gridClass == NO_CLASS && !gridCluster.isInside(gridChosen, grid)) { ////TODO CHECK HERE - characteristicVec.label = newClass; - // newGridList= putHashMap(newGridList,grid, characteristicVec); - auto it1 = newGridList.find(gridChosen); + characteristicVec.label = hChosenClass; + // newGridList.insert(std::make_pair(grid, characteristicVec)); + // newGridList=putHashMap(newGridList,grid, characteristicVec); + auto it1 = newGridList.find(grid); if (it1 != newGridList.end()) - it1->second = characteristicVec; + it1->second = characteristicVec; else - newGridList.insert(std::make_pair(gridChosen, characteristicVec)); - // Iterate through the neighbourhood until no more transitional neighbours can be added - // (dense neighbours will add themselves as part of their adjust 
process) - for (DensityGrid &dghprime : grid.getNeighbours()) - { - if (this->gridList.find(dghprime) != this->gridList.end() && - c.grids.find(dghprime) != c.grids.end()) - { - CharacteristicVector cvhprime = this->gridList.find(dghprime)->second; - if (cvhprime.attribute == TRANSITIONAL) - { - c.addGrid(dghprime); - ////TODO CHECK HERE - cvhprime.label = newClass; - // newGridList.insert(std::make_pair(dghprime, cvhprime)); - // newGridList=putHashMap(newGridList,dghprime, cvhprime); - auto it1 = newGridList.find(dghprime); - if (it1 != newGridList.end()) - it1->second = cvhprime; - else - newGridList.insert(std::make_pair(dghprime, cvhprime)); - } + newGridList.insert(std::make_pair(grid, characteristicVec)); + gridCluster.addGrid(grid); + this->clusterList.at(hChosenClass) = gridCluster; + // SESAME_INFO(" dg is added to cluster "<= |ch| + else if (gridClass != NO_CLASS) { + // SESAME_INFO("Finding this cluster......"); + for (auto c : clusterList) { + if (c.clusterLabel == gridClass) { + // GridCluster c = this->clusterList.at(gridClass); + int gSize = c.grids.size(); + if (gSize >= ChosenGridSize) { + // Move h from cluster ch to cluster c + gridCluster.removeGrid(gridChosen); + c.addGrid(gridChosen); + CharacteristicVector cvhChosen = + this->gridList.find(gridChosen)->second; + ////TODO CHECK HERE + cvhChosen.label = gridClass; + // newGridList.insert(std::make_pair(gridChosen, cvhChosen)); + // newGridList=putHashMap(newGridList,gridChosen, cvhChosen); + + auto it1 = newGridList.find(gridChosen); + if (it1 != newGridList.end()) + it1->second = cvhChosen; + else + newGridList.insert(std::make_pair(gridChosen, cvhChosen)); + // SESAME_INFO("dgClass is "<clusterList.at(hChosenClass) = gridCluster; + this->clusterList.at(gridClass) = c; } + } + } + } + } + } + // If dgClass is dense and not in a cluster, and none if its neighbours are in + // a cluster, put it in its own new cluster and search the neighbourhood for + // transitional or dense grids to add + 
else if (gridClass == NO_CLASS) { + int newClass = (int)this->clusterList.size(); + GridCluster c = GridCluster(newClass); + c.addGrid(grid); + // System.out.println("Added "+dg.toString()+" to cluster "+newClass+"."); + this->clusterList.push_back(c); + ////TODO CHECK HERE + characteristicVec.label = newClass; + // newGridList= putHashMap(newGridList,grid, characteristicVec); + auto it1 = newGridList.find(gridChosen); + if (it1 != newGridList.end()) + it1->second = characteristicVec; + else + newGridList.insert(std::make_pair(gridChosen, characteristicVec)); + // Iterate through the neighbourhood until no more transitional neighbours + // can be added (dense neighbours will add themselves as part of their + // adjust process) + for (DensityGrid &dghprime : grid.getNeighbours()) { + if (this->gridList.find(dghprime) != this->gridList.end() && + c.grids.find(dghprime) != c.grids.end()) { + CharacteristicVector cvhprime = this->gridList.find(dghprime)->second; + if (cvhprime.attribute == TRANSITIONAL) { + c.addGrid(dghprime); + ////TODO CHECK HERE + cvhprime.label = newClass; + // newGridList.insert(std::make_pair(dghprime, cvhprime)); + // newGridList=putHashMap(newGridList,dghprime, cvhprime); + auto it1 = newGridList.find(dghprime); + if (it1 != newGridList.end()) + it1->second = cvhprime; + else + newGridList.insert(std::make_pair(dghprime, cvhprime)); } - this->clusterList.at(newClass) = c; + } } + this->clusterList.at(newClass) = c; + } - return newGridList; + return newGridList; } /** - * Adjusts the clustering of a transitional density grid. Implements lines 20 and 21 from Figure 4 - * of Chen and Tu 2007. + * Adjusts the clustering of a transitional density grid. Implements lines 20 + * and 21 from Figure 4 of Chen and Tu 2007. 
* * @param dg the dense density grid being adjusted * @param cv the characteristic vector of dg * @param dgClass the cluster to which dg belonged * - * @return a HashMap containing density grids for update after - * this iteration + * @return a HashMap containing density grids + * for update after this iteration */ -SESAME::HashMap SESAME::DStream::adjustForTransitionalGrid(DensityGrid grid, - CharacteristicVector characteristicVec, - int gridClass) -{ - // Among all neighbours of dg, find the grid h whose cluster ch has the largest size - // and satisfies that dg would be an outside grid if added to it - GridCluster gridCluster; // The cluster, ch, of h - double hChosenSize = 0.0; // The size of ch, the largest cluster - DensityGrid neighbourGrid; // The neighbour of dg being considered - int hClass = NO_CLASS; // The class label of h - int hChosenClass = NO_CLASS; // The class label of ch - HashMap newGridList; - if (gridClass != -1 && gridClass > clusterList.size()) - { - SESAME_INFO("Current grids size and cluster size is " - << gridList.size() << " and " << clusterList.size() - << " adjust For Transitional Grid " << gridClass << "."); - } - - for (DensityGrid &neighbourGrid : grid.getNeighbours()) - { - if (this->gridList.find(neighbourGrid) != gridList.end()) - { - hClass = this->gridList.find(neighbourGrid)->second.label; - ; - if (hClass != NO_CLASS) - { - for (auto gridCluster : clusterList) - { - if (gridCluster.clusterLabel == hClass) - { - if ((gridCluster.grids.size() > hChosenSize) && - !gridCluster.isInside(grid, grid)) - { - hChosenSize = gridCluster.grids.size(); - hChosenClass = hClass; - } - } - } +SESAME::HashMap SESAME::DStream::adjustForTransitionalGrid( + DensityGrid grid, CharacteristicVector characteristicVec, int gridClass) { + // Among all neighbours of dg, find the grid h whose cluster ch has the + // largest size and satisfies that dg would be an outside grid if added to it + GridCluster gridCluster; // The cluster, ch, of h + double 
hChosenSize = 0.0; // The size of ch, the largest cluster + DensityGrid neighbourGrid; // The neighbour of dg being considered + int hClass = NO_CLASS; // The class label of h + int hChosenClass = NO_CLASS; // The class label of ch + HashMap newGridList; + if (gridClass != -1 && gridClass > clusterList.size()) { + SESAME_INFO("Current grids size and cluster size is " + << gridList.size() << " and " << clusterList.size() + << " adjust For Transitional Grid " << gridClass << "."); + } + + for (DensityGrid &neighbourGrid : grid.getNeighbours()) { + if (this->gridList.find(neighbourGrid) != gridList.end()) { + hClass = this->gridList.find(neighbourGrid)->second.label; + ; + if (hClass != NO_CLASS) { + for (auto gridCluster : clusterList) { + if (gridCluster.clusterLabel == hClass) { + if ((gridCluster.grids.size() > hChosenSize) && + !gridCluster.isInside(grid, grid)) { + hChosenSize = gridCluster.grids.size(); + hChosenClass = hClass; } + } } + } + } + } + + // System.out.println(" Chosen neighbour is from cluster "+hChosenClass+", + // dgClass is + // "+dgClass+"."); + + if (hChosenClass != NO_CLASS && hChosenClass != gridClass) { + for (auto gridCluster : clusterList) { + if (gridCluster.clusterLabel == hChosenClass) { + gridCluster.addGrid(grid); + this->clusterList.at(hChosenClass) = gridCluster; + } } - // System.out.println(" Chosen neighbour is from cluster "+hChosenClass+", dgClass is - // "+dgClass+"."); - - if (hChosenClass != NO_CLASS && hChosenClass != gridClass) + if (gridClass != NO_CLASS) { + GridCluster cluster = this->clusterList.at(gridClass); + cluster.removeGrid(grid); + this->clusterList.at(gridClass) = cluster; + } + ////TODO CHECK HERE + characteristicVec.label = hChosenClass; + auto it1 = newGridList.find(grid); + if (it1 != newGridList.end()) + it1->second = characteristicVec; + else + // newGridList.insert(std::make_pair(grid, characteristicVec)); + // newGridList= putHashMap(newGridList,grid, characteristicVec); { - for (auto gridCluster : 
clusterList) - { - if (gridCluster.clusterLabel == hChosenClass) - { - gridCluster.addGrid(grid); - this->clusterList.at(hChosenClass) = gridCluster; - } - } - - if (gridClass != NO_CLASS) - { - GridCluster cluster = this->clusterList.at(gridClass); - cluster.removeGrid(grid); - this->clusterList.at(gridClass) = cluster; - } - ////TODO CHECK HERE - characteristicVec.label = hChosenClass; - auto it1 = newGridList.find(grid); - if (it1 != newGridList.end()) - it1->second = characteristicVec; - else - // newGridList.insert(std::make_pair(grid, characteristicVec)); - // newGridList= putHashMap(newGridList,grid, characteristicVec); - { - auto it1 = newGridList.find(grid); - if (it1 != newGridList.end()) - it1->second = characteristicVec; - else - newGridList.insert(std::make_pair(grid, characteristicVec)); - } + auto it1 = newGridList.find(grid); + if (it1 != newGridList.end()) + it1->second = characteristicVec; + else + newGridList.insert(std::make_pair(grid, characteristicVec)); } + } - return newGridList; + return newGridList; } -SESAME::HashMap SESAME::DStream::mergeNewClusters(SESAME::HashMap newGridList, int smallCluster, - int bigCluster) -{ - // System.out.println("Merge new clusters "+smallCluster+" and "+bigCluster+"."); - // Iterate through the density grids in grid_list to find those which are in highClass - for (HashMap::iterator gridIter = newGridList.begin(); gridIter != gridList.end(); gridIter++) - { - DensityGrid grid = gridIter->first; - CharacteristicVector characteristicVec = gridIter->second; - - // Assign density grids in small Cluster to bigCluster - if (characteristicVec.label == smallCluster) - { - ////TODO CHECK HERE - characteristicVec.label = bigCluster; - auto it1 = newGridList.find(grid); - if (it1 != newGridList.end()) - it1->second = characteristicVec; - else - // newGridList.insert(std::make_pair(grid, characteristicVec)); - // newGridList= putHashMap(newGridList,grid, characteristicVec); - newGridList.insert(std::make_pair(grid, 
characteristicVec)); - } +SESAME::HashMap SESAME::DStream::mergeNewClusters(SESAME::HashMap newGridList, + int smallCluster, + int bigCluster) { + // System.out.println("Merge new clusters "+smallCluster+" and + // "+bigCluster+"."); Iterate through the density grids in grid_list to find + // those which are in highClass + for (HashMap::iterator gridIter = newGridList.begin(); + gridIter != gridList.end(); gridIter++) { + DensityGrid grid = gridIter->first; + CharacteristicVector characteristicVec = gridIter->second; + + // Assign density grids in small Cluster to bigCluster + if (characteristicVec.label == smallCluster) { + ////TODO CHECK HERE + characteristicVec.label = bigCluster; + auto it1 = newGridList.find(grid); + if (it1 != newGridList.end()) + it1->second = characteristicVec; + else + // newGridList.insert(std::make_pair(grid, characteristicVec)); + // newGridList= putHashMap(newGridList,grid, characteristicVec); + newGridList.insert(std::make_pair(grid, characteristicVec)); } - // SESAME_INFO("Density grids assigned to cluster "<newClusterList.at(bigCluster); - bGC.absorbCluster(this->newClusterList.at(smallCluster)); - this->newClusterList.at(bigCluster) = bGC; - this->newClusterList.erase(this->newClusterList.begin() + smallCluster); - // System.out.println("Cluster "+smallClus+" removed from list."); - newGridList = cleanNewClusters(newGridList); - - return newGridList; + } + // SESAME_INFO("Density grids assigned to cluster "<newClusterList.at(bigCluster); + bGC.absorbCluster(this->newClusterList.at(smallCluster)); + this->newClusterList.at(bigCluster) = bGC; + this->newClusterList.erase(this->newClusterList.begin() + smallCluster); + // System.out.println("Cluster "+smallClus+" removed from list."); + newGridList = cleanNewClusters(newGridList); + + return newGridList; } /** - * Determines whether a sparse density grid is sporadic using rules S1 and S2 of Chen and Tu 2007 + * Determines whether a sparse density grid is sporadic using rules S1 and S2 
of + * Chen and Tu 2007 * - * @param characteristicVec - the CharacteristicVector of the density grid being assessed for - * sporadicity + * @param characteristicVec - the CharacteristicVector of the density grid being + * assessed for sporadicity */ -bool SESAME::DStream::checkIfSporadic(CharacteristicVector characteristicVec) -{ - // Check S1 - if (characteristicVec.getCurrGridDensity(currentTimeStamp, param.lambda) < - outlier_density_thresholdFunction(characteristicVec.densityUpdateTime, param.cl, - param.lambda, this->NGrids)) - { - // Check S2 TODO CHANGE REMOVE TIME FROM 0 TO -1 - if (characteristicVec.removeTime == 0 || - (currentTimeStamp - ((1 + param.beta) * characteristicVec.removeTime)) >= 0) - return true; - } - - return false; +bool SESAME::DStream::checkIfSporadic(CharacteristicVector characteristicVec) { + // Check S1 + if (characteristicVec.getCurrGridDensity(currentTimeStamp, param.lambda) < + outlier_density_thresholdFunction(characteristicVec.densityUpdateTime, + param.cl, param.lambda, this->NGrids)) { + // Check S2 TODO CHANGE REMOVE TIME FROM 0 TO -1 + if (characteristicVec.removeTime == 0 || + (currentTimeStamp - + ((1 + param.beta) * characteristicVec.removeTime)) >= 0) + return true; + } + + return false; } /** * Implements the function pi given in Definition 4.1 of Chen and Tu 2007 * * @param tg - the update time in the density grid's characteristic vector - * @param cl - user defined parameter which controls the threshold for sparse grids + * @param cl - user defined parameter which controls the threshold for sparse + * grids * @param lambda - see lambda definition * @param NGrids - the number of density grids, */ -double SESAME::DStream::outlier_density_thresholdFunction(int tg, double cl, double lambda, - int NGrids) -{ - return (cl * (1.0 - pow(lambda, ((double)(currentTimeStamp - tg) + 1.0)))) / - (NGrids * (1.0 - lambda)); +double SESAME::DStream::outlier_density_thresholdFunction(int tg, double cl, + double lambda, + int NGrids) { 
+ return (cl * (1.0 - pow(lambda, ((double)(currentTimeStamp - tg) + 1.0)))) / + (NGrids * (1.0 - lambda)); } /** @@ -975,148 +905,137 @@ double SESAME::DStream::outlier_density_thresholdFunction(int tg, double cl, dou * @param smallCluster - the index of the smaller cluster * @param bigCluster - the index of the bigger cluster */ -void SESAME::DStream::mergeClusters(int smallCluster, int bigCluster) -{ - // SESAME_INFO("Merge clusters "<first; - CharacteristicVector characteristicVec = gridIter->second; - - // Assign density grids in smallCluster to bigCluster - if (characteristicVec.label == smallCluster) - { - ////TODO CHECK HERE - characteristicVec.label = bigCluster; - gridIter->second = characteristicVec; - } +void SESAME::DStream::mergeClusters(int smallCluster, int bigCluster) { + // SESAME_INFO("Merge clusters "<first; + CharacteristicVector characteristicVec = gridIter->second; + + // Assign density grids in smallCluster to bigCluster + if (characteristicVec.label == smallCluster) { + ////TODO CHECK HERE + characteristicVec.label = bigCluster; + gridIter->second = characteristicVec; } - // SESAME_INFO("Density grids assigned to cluster "<clusterList.at(bigCluster); - bigGridCluster.absorbCluster(this->clusterList.at(smallCluster)); - this->clusterList.at(bigCluster) = bigGridCluster; - this->clusterList.erase(clusterList.begin() + smallCluster); - // SESAME_INFO("Cluster "<clusterList.at(bigCluster); + bigGridCluster.absorbCluster(this->clusterList.at(smallCluster)); + this->clusterList.at(bigCluster) = bigGridCluster; + this->clusterList.erase(clusterList.begin() + smallCluster); + // SESAME_INFO("Cluster "<0) SESAME_INFO("Merge grid list! size this and other is "<< - // thisGridList.size()<<" "<0) SESAME_INFO("Merge grid list! 
size this and other is + // "<< thisGridList.size()<<" "< toRemove; - // Check to see if there are any empty clusters - for (auto &cluster : this->newClusterList) - { - if (cluster.grids.empty()) toRemove.push_back(cluster); +SESAME::HashMap SESAME::DStream::cleanNewClusters(SESAME::HashMap newGridList) { + std::vector toRemove; + // Check to see if there are any empty clusters + for (auto &cluster : this->newClusterList) { + if (cluster.grids.empty()) + toRemove.push_back(cluster); + } + + // Remove empty clusters + if (!toRemove.empty()) { + for (auto &RemoveCluster : toRemove) { + auto removeCIter = std::find(newClusterList.begin(), newClusterList.end(), + RemoveCluster); + if (std::find(newClusterList.begin(), newClusterList.end(), + RemoveCluster) != newClusterList.end()) + this->newClusterList.erase(removeCIter); } - - // Remove empty clusters - if (!toRemove.empty()) - { - for (auto &RemoveCluster : toRemove) - { - auto removeCIter = - std::find(newClusterList.begin(), newClusterList.end(), RemoveCluster); - if (std::find(newClusterList.begin(), newClusterList.end(), RemoveCluster) != - newClusterList.end()) - this->newClusterList.erase(removeCIter); - } + } + for (auto &cluster : this->newClusterList) { + auto clusterIter = + std::find(newClusterList.begin(), newClusterList.end(), cluster); + int index = (int)std::distance(newClusterList.begin(), clusterIter); + ////TODO CHECK HERE + cluster.clusterLabel = index; + unordered_map removeGrids; + for (auto &gridOfCluster : cluster.grids) { + DensityGrid grid = gridOfCluster.first; + ; + if (newGridList.find(grid) != newGridList.end()) + newGridList.find(grid)->second.label = index; + else + removeGrids.insert(gridOfCluster); } - for (auto &cluster : this->newClusterList) - { - auto clusterIter = std::find(newClusterList.begin(), newClusterList.end(), cluster); - int index = (int)std::distance(newClusterList.begin(), clusterIter); - ////TODO CHECK HERE - cluster.clusterLabel = index; - unordered_map 
removeGrids; - for (auto &gridOfCluster : cluster.grids) - { - DensityGrid grid = gridOfCluster.first; - ; - if (newGridList.find(grid) != newGridList.end()) - newGridList.find(grid)->second.label = index; - else - removeGrids.insert(gridOfCluster); - } - for (auto &grid : removeGrids) - { - if (cluster.grids.find(grid.first) != cluster.grids.end()) - cluster.grids.erase(grid.first); - } + for (auto &grid : removeGrids) { + if (cluster.grids.find(grid.first) != cluster.grids.end()) + cluster.grids.erase(grid.first); } - // SESAME_INFO("Clean finish!"); - return newGridList; + } + // SESAME_INFO("Clean finish!"); + return newGridList; } // TODO stop right here and confused about get() /** - * Iterates through cluster_list to ensure that all empty clusters have been removed and - * that all cluster IDs match the cluster's index in cluster_list. + * Iterates through cluster_list to ensure that all empty clusters have been + * removed and that all cluster IDs match the cluster's index in cluster_list. 
*/ -void SESAME::DStream::cleanClusters() -{ - // SESAME_INFO("Clean Clusters"); - - std::vector toRemove; - - // Check to see if there are any empty clusters - for (auto &cluster : this->clusterList) - { - if (cluster.grids.empty()) toRemove.push_back(cluster); - } - // Remove empty clusters - if (!toRemove.empty()) - { - for (auto &RemoveCluster : toRemove) - { - auto removeCIter = std::find(clusterList.begin(), clusterList.end(), RemoveCluster); - if (std::find(clusterList.begin(), clusterList.end(), RemoveCluster) != - clusterList.end()) - this->clusterList.erase(removeCIter); - } +void SESAME::DStream::cleanClusters() { + // SESAME_INFO("Clean Clusters"); + + std::vector toRemove; + + // Check to see if there are any empty clusters + for (auto &cluster : this->clusterList) { + if (cluster.grids.empty()) + toRemove.push_back(cluster); + } + // Remove empty clusters + if (!toRemove.empty()) { + for (auto &RemoveCluster : toRemove) { + auto removeCIter = + std::find(clusterList.begin(), clusterList.end(), RemoveCluster); + if (std::find(clusterList.begin(), clusterList.end(), RemoveCluster) != + clusterList.end()) + this->clusterList.erase(removeCIter); } - // Adjust remaining clusters as necessary, index = label = order - for (auto &cluster : this->clusterList) - { - auto clusterIter = std::find(clusterList.begin(), clusterList.end(), cluster); - int index = (int)std::distance(clusterList.begin(), clusterIter); + } + // Adjust remaining clusters as necessary, index = label = order + for (auto &cluster : this->clusterList) { + auto clusterIter = + std::find(clusterList.begin(), clusterList.end(), cluster); + int index = (int)std::distance(clusterList.begin(), clusterIter); + ////TODO CHECK HERE + cluster.clusterLabel = index; + unordered_map removeGrids; + for (auto &gridOfCluster : cluster.grids) { + DensityGrid grid = gridOfCluster.first; + if (gridList.find(grid) != gridList.end()) ////TODO CHECK HERE - cluster.clusterLabel = index; - unordered_map 
removeGrids; - for (auto &gridOfCluster : cluster.grids) - { - DensityGrid grid = gridOfCluster.first; - if (gridList.find(grid) != gridList.end()) - ////TODO CHECK HERE - gridList.find(grid)->second.label = index; - else - removeGrids.insert(gridOfCluster); - } - for (auto &grid : removeGrids) - { - if (cluster.grids.find(grid.first) != cluster.grids.end()) - cluster.grids.erase(grid.first); - } - this->clusterList.at(index) = cluster; + gridList.find(grid)->second.label = index; + else + removeGrids.insert(gridOfCluster); } + for (auto &grid : removeGrids) { + if (cluster.grids.find(grid.first) != cluster.grids.end()) + cluster.grids.erase(grid.first); + } + this->clusterList.at(index) = cluster; + } } /** @@ -1129,90 +1048,78 @@ void SESAME::DStream::cleanClusters() b. Else i. If (S1 && S2), mark as sporadic */ -void SESAME::DStream::removeSporadic() -{ - // SESAME_INFO("REMOVE SPORADIC CALLED"); - // For each grid g in grid_list - - HashMap newGridList; - std::vector removeGridList; - for (auto &gridIter : this->gridList) - { - const DensityGrid &grid = gridIter.first; - CharacteristicVector characteristicVec = gridIter.second; - // If g is sporadic - if (characteristicVec.isSporadic) - { - // If currTime - tg > gap, delete g from grid_list - if (currentTimeStamp - characteristicVec.updateTime >= gap) - { - int gridClass = characteristicVec.label; - - if (gridClass != -1) - { - for (auto gridCluster : clusterList) - { - if (gridCluster.clusterLabel == gridClass) - { - gridCluster.removeGrid(grid); - } - } - } - removeGridList.push_back(grid); - } - // Else if (S1 && S2), mark as sporadic - Else mark as normal - else - { - characteristicVec.isSporadic = checkIfSporadic(characteristicVec); - // newGridList.insert(std::make_pair(grid, characteristicVec)); - // newGridList= putHashMap(newGridList,grid, characteristicVec); - auto it1 = newGridList.find(grid); - if (it1 != newGridList.end()) - it1->second = characteristicVec; - else - 
newGridList.insert(std::make_pair(grid, characteristicVec)); +void SESAME::DStream::removeSporadic() { + // SESAME_INFO("REMOVE SPORADIC CALLED"); + // For each grid g in grid_list + + HashMap newGridList; + std::vector removeGridList; + for (auto &gridIter : this->gridList) { + const DensityGrid &grid = gridIter.first; + CharacteristicVector characteristicVec = gridIter.second; + // If g is sporadic + if (characteristicVec.isSporadic) { + // If currTime - tg > gap, delete g from grid_list + if (currentTimeStamp - characteristicVec.updateTime >= gap) { + int gridClass = characteristicVec.label; + + if (gridClass != -1) { + for (auto gridCluster : clusterList) { + if (gridCluster.clusterLabel == gridClass) { + gridCluster.removeGrid(grid); } + } } - // Else if (S1 && S2), mark as sporadic + removeGridList.push_back(grid); + } + // Else if (S1 && S2), mark as sporadic - Else mark as normal + else { + characteristicVec.isSporadic = checkIfSporadic(characteristicVec); + // newGridList.insert(std::make_pair(grid, characteristicVec)); + // newGridList= putHashMap(newGridList,grid, characteristicVec); + auto it1 = newGridList.find(grid); + if (it1 != newGridList.end()) + it1->second = characteristicVec; else - { - characteristicVec.isSporadic = checkIfSporadic(characteristicVec); - // newGridList.insert(std::make_pair(grid, characteristicVec)); - // newGridList= putHashMap(newGridList,grid, characteristicVec); - auto it1 = newGridList.find(grid); - if (it1 != newGridList.end()) - it1->second = characteristicVec; - else - newGridList.insert(std::make_pair(grid, characteristicVec)); - } + newGridList.insert(std::make_pair(grid, characteristicVec)); + } } - mergeGridList(gridList, newGridList); - - // SESAME_INFO(" - Removed "<deletedGrids.insert(std::make_pair(sporadicGrid, currentTimeStamp)); - this->gridList.erase(sporadicGrid); - for (auto &cluster : this->clusterList) - { - if (cluster.grids.find(sporadicGrid) != cluster.grids.end()) - { - 
cluster.grids.erase(sporadicGrid); - } - } - for (auto &cluster : this->newClusterList) - { - if (cluster.grids.find(sporadicGrid) != cluster.grids.end()) - { - cluster.grids.erase(sporadicGrid); - break; - } - } + // Else if (S1 && S2), mark as sporadic + else { + characteristicVec.isSporadic = checkIfSporadic(characteristicVec); + // newGridList.insert(std::make_pair(grid, characteristicVec)); + // newGridList= putHashMap(newGridList,grid, characteristicVec); + auto it1 = newGridList.find(grid); + if (it1 != newGridList.end()) + it1->second = characteristicVec; + else + newGridList.insert(std::make_pair(grid, characteristicVec)); + } + } + mergeGridList(gridList, newGridList); + + // SESAME_INFO(" - Removed "<deletedGrids.insert(std::make_pair(sporadicGrid, currentTimeStamp)); + this->gridList.erase(sporadicGrid); + for (auto &cluster : this->clusterList) { + if (cluster.grids.find(sporadicGrid) != cluster.grids.end()) { + cluster.grids.erase(sporadicGrid); + } + } + for (auto &cluster : this->newClusterList) { + if (cluster.grids.find(sporadicGrid) != cluster.grids.end()) { + cluster.grids.erase(sporadicGrid); + break; + } } + } } /* -HashMap SESAME::DStream::putHashMap(HashMap gList, const DensityGrid& g, CharacteristicVector cv) +HashMap SESAME::DStream::putHashMap(HashMap gList, const DensityGrid& g, +CharacteristicVector cv) { auto it1 = gList.find(g); if (it1 != gList.end()) diff --git a/src/Algorithm/DataStructure/CFTree.cpp b/src/Algorithm/DataStructure/CFTree.cpp index 7e6f8d85..facf33ed 100644 --- a/src/Algorithm/DataStructure/CFTree.cpp +++ b/src/Algorithm/DataStructure/CFTree.cpp @@ -14,17 +14,13 @@ using namespace std::experimental; -namespace SESAME -{ +namespace SESAME { CFTree::CFTree(const SesameParam ¶m) - : max_in_nodes(param.max_in_nodes), - max_leaf_nodes(param.max_leaf_nodes), - distance_threshold(param.distance_threshold) -{} + : max_in_nodes(param.max_in_nodes), max_leaf_nodes(param.max_leaf_nodes), + 
distance_threshold(param.distance_threshold) {} CFTree::CFTree(int b = 0, int l = 0, double t = 0.0) - : max_in_nodes(b), max_leaf_nodes(l), distance_threshold(t) -{} + : max_in_nodes(b), max_leaf_nodes(l), distance_threshold(t) {} CFTree::~CFTree() {} int CFTree::getB() const { return this->max_in_nodes; } @@ -43,50 +39,44 @@ void CFNode::setNode(CFPtr &Node) { this->curCF = Node; } void CFNode::setParent(NodePtr &Parent) { this->parent = Parent; } void CFNode::setIndex(int Index) { this->index = Index; } void CFNode::setChild(NodePtr &child) { this->children.push_back(child); } -void CFNode::setChildren(std::vector children) { this->children = children; } -CFNode::CFNode() -{ - this->curCF = std::make_shared(); - this->isLeaf = true; +void CFNode::setChildren(std::vector children) { + this->children = children; +} +CFNode::CFNode() { + this->curCF = std::make_shared(); + this->isLeaf = true; } CFNode::~CFNode() {} bool CFNode::getIsLeaf() { return this->isLeaf; } void CFNode::setIsLeaf(bool leaf) { this->isLeaf = leaf; } NodePtr CFNode::copy() { return std::make_shared(*this); } -void CFNode::setCF(CFPtr &cf) -{ - this->curCF->setN(cf->getN()); - std::vector ls = cf->getLS(); - std::vector ss = cf->getSS(); - this->curCF->setLS(ls); - this->curCF->setSS(ss); +void CFNode::setCF(CFPtr &cf) { + this->curCF->setN(cf->getN()); + std::vector ls = cf->getLS(); + std::vector ss = cf->getSS(); + this->curCF->setLS(ls); + this->curCF->setSS(ss); } // when need to clear the parent node, directly set the index = -1 void CFNode::clearParents() { this->parent->setIndex(-1); } -void CFNode::removeChild(NodePtr &child) -{ - for (int i = 0; i < this->children.size(); i++) - { - if (this->children[i]->getIndex() == child->getIndex()) - { - auto childCF = this->children[i]->getCF(); - auto removeCF = child->getCF(); - bool flag = true; - for (int j = 0; j < childCF->getLS().size(); j++) - { - if (childCF->getLS()[j] != removeCF->getLS()[j] or - childCF->getSS()[j] != 
removeCF->getSS()[j]) - { - flag = false; - break; - } - } - if (flag) - { - this->children.erase(this->children.begin() + i); - } +void CFNode::removeChild(NodePtr &child) { + for (int i = 0; i < this->children.size(); i++) { + if (this->children[i]->getIndex() == child->getIndex()) { + auto childCF = this->children[i]->getCF(); + auto removeCF = child->getCF(); + bool flag = true; + for (int j = 0; j < childCF->getLS().size(); j++) { + if (childCF->getLS()[j] != removeCF->getLS()[j] or + childCF->getSS()[j] != removeCF->getSS()[j]) { + flag = false; + break; } + } + if (flag) { + this->children.erase(this->children.begin() + i); + } } + } } bool SESAME::CFNode::getOutlier() { return this->outlier; } void SESAME::CFNode::setOutlier(bool flag) { this->outlier = flag; } @@ -94,366 +84,310 @@ void SESAME::CFNode::setOutlier(bool flag) { this->outlier = flag; } ClusteringFeaturesTree::~ClusteringFeaturesTree() {} ClusteringFeaturesTree::ClusteringFeaturesTree(const SesameParam ¶m) - : dim(param.dim), - max_in_nodes(param.max_in_nodes), + : dim(param.dim), max_in_nodes(param.max_in_nodes), max_leaf_nodes(param.max_leaf_nodes), - distance_threshold(param.distance_threshold) -{ - root_ = GenericFactory::New(nullptr, param.dim); - root_->index = leafMask++; + distance_threshold(param.distance_threshold) { + root_ = GenericFactory::New(nullptr, param.dim); + root_->index = leafMask++; } -ClusteringFeaturesTree::NodePtr ClusteringFeaturesTree::Insert(PointPtr point) -{ - auto curNode = root_; - if (curNode->cf.num == 0) - { - curNode->Update(point, true); - clusters_.push_back(curNode); - } - else - { - while (1) - { - if (curNode->IsLeaf()) - { - auto centroid = curNode->Centroid(); - if (point->L2Dist(centroid) <= distance_threshold) - { // concept drift detection - // whether the new radius is lower than threshold T - curNode->Update(point, true); - // means this point could get included in this cluster - // SESAME_DEBUG("No concept drift occurs(t <= T), insert tha point 
- // into the leaf node..."); - break; - // Normally insert the data point into the tree leafNode without - // concept drift - } - else - { - // concept drift adaption - // SESAME_DEBUG("Concept drift occurs(t > T), the current leaf node - // capacity reaches the threshold T"); - return backwardEvolution(curNode, point); - } - } - else - { - curNode = CalcClosestNode(curNode->children, point).first; - } +ClusteringFeaturesTree::NodePtr ClusteringFeaturesTree::Insert(PointPtr point) { + auto curNode = root_; + if (curNode->cf.num == 0) { + curNode->Update(point, true); + clusters_.push_back(curNode); + } else { + while (1) { + if (curNode->IsLeaf()) { + auto centroid = curNode->Centroid(); + if (point->L2Dist(centroid) <= + distance_threshold) { // concept drift detection + // whether the new radius is lower than threshold T + curNode->Update(point, true); + // means this point could get included in this cluster + // SESAME_DEBUG("No concept drift occurs(t <= T), insert tha point + // into the leaf node..."); + break; + // Normally insert the data point into the tree leafNode without + // concept drift + } else { + // concept drift adaption + // SESAME_DEBUG("Concept drift occurs(t > T), the current leaf node + // capacity reaches the threshold T"); + return backwardEvolution(curNode, point); } + } else { + curNode = CalcClosestNode(curNode->children, point).first; + } } - return curNode; + } + return curNode; } -ClusteringFeaturesTree::NodePtr ClusteringFeaturesTree::Insert(NodePtr node) -{ - auto curNode = root_; - auto center = node->Centroid(); - while (1) - { - auto childrenNode = curNode->children; - if (curNode->IsLeaf()) - { - // timerMeter.clusterUpdateAccMeasure(); - // timerMeter.dataInsertAccMeasure(); - auto centroid = curNode->Centroid(); - // timerMeter.dataInsertEndMeasure(); - if (center->L2Dist(centroid) <= distance_threshold) - { // concept drift detection - // whether the new radius is lower than threshold T - // 
timerMeter.dataInsertAccMeasure(); - curNode->Update(node, true); +ClusteringFeaturesTree::NodePtr ClusteringFeaturesTree::Insert(NodePtr node) { + auto curNode = root_; + auto center = node->Centroid(); + while (1) { + auto childrenNode = curNode->children; + if (curNode->IsLeaf()) { + // timerMeter.clusterUpdateAccMeasure(); + // timerMeter.dataInsertAccMeasure(); + auto centroid = curNode->Centroid(); + // timerMeter.dataInsertEndMeasure(); + if (center->L2Dist(centroid) <= + distance_threshold) { // concept drift detection + // whether the new radius is lower than threshold T + // timerMeter.dataInsertAccMeasure(); + curNode->Update(node, true); - // means this point could get included in this cluster - // SESAME_DEBUG("No concept drift occurs(t <= T), insert tha point - // into the leaf node..."); - break; - // Normally insert the data point into the tree leafNode without - // concept drift - } - else - { - // concept drift adaption - // SESAME_DEBUG("Concept drift occurs(t > T), the current leaf node - // capacity reaches the threshold T"); - // timerMeter.clusterUpdateAccMeasure(); - return backwardEvolution(curNode, node); - // timerMeter.clusterUpdateEndMeasure(); - // break; - } - } - else - { - // timerMeter.dataInsertAccMeasure(); - curNode = CalcClosestNode(childrenNode, center).first; - // timerMeter.dataInsertEndMeasure(); - } + // means this point could get included in this cluster + // SESAME_DEBUG("No concept drift occurs(t <= T), insert tha point + // into the leaf node..."); + break; + // Normally insert the data point into the tree leafNode without + // concept drift + } else { + // concept drift adaption + // SESAME_DEBUG("Concept drift occurs(t > T), the current leaf node + // capacity reaches the threshold T"); + // timerMeter.clusterUpdateAccMeasure(); + return backwardEvolution(curNode, node); + // timerMeter.clusterUpdateEndMeasure(); + // break; + } + } else { + // timerMeter.dataInsertAccMeasure(); + curNode = 
CalcClosestNode(childrenNode, center).first; + // timerMeter.dataInsertEndMeasure(); } - return curNode; + } + return curNode; } void ClusteringFeaturesTree::Init() { root_->tree = shared_from_this(); } -void ClusteringFeaturesTree::Remove(NodePtr node) -{ - auto parent = node->parent; - if (parent != nullptr) parent->RemoveChild(node); - const auto [first, last] = - std::ranges::remove_if(clusters_, [node](auto &cluster) { return cluster == node; }); - clusters_.erase(first, last); +void ClusteringFeaturesTree::Remove(NodePtr node) { + auto parent = node->parent; + if (parent != nullptr) + parent->RemoveChild(node); + const auto [first, last] = std::ranges::remove_if( + clusters_, [node](auto &cluster) { return cluster == node; }); + clusters_.erase(first, last); } -void ClusteringFeaturesTree::ForEach(std::function func) -{ - std::queue queue; - queue.push(root_); - while (!queue.empty()) - { - auto node = queue.front(); - queue.pop(); - func(node); - for (auto &child : node->children) - { - queue.push(child); - } +void ClusteringFeaturesTree::ForEach(std::function func) { + std::queue queue; + queue.push(root_); + while (!queue.empty()) { + auto node = queue.front(); + queue.pop(); + func(node); + for (auto &child : node->children) { + queue.push(child); } + } } template -ClusteringFeaturesTree::NodePtr ClusteringFeaturesTree::backwardEvolution(NodePtr node, T input) -{ - if (node->parent == nullptr) - { // means current node is root node - // l <= L, create a new leaf node and insert the point into it(root - // change) - auto newRoot = GenericFactory::New(shared_from_this(), dim); - newRoot->AddChild(node); +ClusteringFeaturesTree::NodePtr +ClusteringFeaturesTree::backwardEvolution(NodePtr node, T input) { + if (node->parent == nullptr) { // means current node is root node + // l <= L, create a new leaf node and insert the point into it(root + // change) + auto newRoot = GenericFactory::New(shared_from_this(), dim); + newRoot->AddChild(node); - auto newNode = 
GenericFactory::New(shared_from_this(), dim); - newRoot->AddChild(newNode); - newRoot->cf = node->cf; - newRoot->index = leafMask++; - // here we need to remove the old root and add the new one into the - // leafnodes set update the parent node - newNode->Update(input, true); - clusters_.push_back(newRoot); - root_ = newRoot; - return newNode; - } - else - { - auto parent = node->parent; - auto newNode = GenericFactory::New(shared_from_this(), dim); - parent->AddChild(newNode); - newNode->Update(input, false); - if (parent->children.size() < max_leaf_nodes) - { - // whether the number of CFs(clusters_) in the current leaf node is - // lower thant threshold L l <= L, create a new leaf node and insert - // the point into it update the parent node and all nodes on the path - // to root node - parent->Update(input, true); + auto newNode = GenericFactory::New(shared_from_this(), dim); + newRoot->AddChild(newNode); + newRoot->cf = node->cf; + newRoot->index = leafMask++; + // here we need to remove the old root and add the new one into the + // leafnodes set update the parent node + newNode->Update(input, true); + clusters_.push_back(newRoot); + root_ = newRoot; + return newNode; + } else { + auto parent = node->parent; + auto newNode = GenericFactory::New(shared_from_this(), dim); + parent->AddChild(newNode); + newNode->Update(input, false); + if (parent->children.size() < max_leaf_nodes) { + // whether the number of CFs(clusters_) in the current leaf node is + // lower thant threshold L l <= L, create a new leaf node and insert + // the point into it update the parent node and all nodes on the path + // to root node + parent->Update(input, true); + } else { + // l > L, parent node of the current leaf node capacity reaches the + // threshold L, split a new parent node from the old one + bool nodeIsClus = true; + while (true) { + NodePtr parParent; + if (parent->parent == nullptr) { + // if the parent node is the root, we need to create a new root as + // a parParent + 
parParent = GenericFactory::New(shared_from_this(), dim); + // parParent->children = root_->children; + root_ = parParent; + // since the parent node's nls has not been updated by the point, + // so we directly copy the nls in parent node to the parParent one + parParent->cf = parent->cf; + parParent->index = leafMask++; + parParent->AddChild(parent); + } else { + // if the parent node is not the root, we can get the parParent + // one directly + parParent = parent->parent; } - else - { - // l > L, parent node of the current leaf node capacity reaches the - // threshold L, split a new parent node from the old one - bool nodeIsClus = true; - while (true) - { - NodePtr parParent; - if (parent->parent == nullptr) - { - // if the parent node is the root, we need to create a new root as - // a parParent - parParent = GenericFactory::New(shared_from_this(), dim); - // parParent->children = root_->children; - root_ = parParent; - // since the parent node's nls has not been updated by the point, - // so we directly copy the nls in parent node to the parParent one - parParent->cf = parent->cf; - parParent->index = leafMask++; - parParent->AddChild(parent); - } - else - { - // if the parent node is not the root, we can get the parParent - // one directly - parParent = parent->parent; - } - // we need to create a new parent node since the old one has to - // split - auto newParentA = GenericFactory::New(shared_from_this(), dim); - // insert the new parent into the allNode list - // we also need to insert the new parent node into the clusterNode - // list if its children is a leaf node. 
- if (parent->children[0]->IsLeaf()) - { - newParentA->index = leafMask++; - clusters_.push_back(newParentA); - } - // we only create a new parent rather and keep the old parent node - // as the split two sub-nodes so we need to refresh the old parent - // node as a blank one and treat it as a new parent B - - // parent->parent = parParent; // link the parparent node and the new - // // created new parent A - parParent->AddChild(newParentA); - // clean cf of the old parent node and initialize the cf of new - // parent A (ls and ss all have d number of 0) - // split the child nodes of the old parent nodes - std::vector broNodes; - parent->children.swap(broNodes); - auto adjMatrix = CalcAdjMatrix(broNodes); // calculate the distance between - // each two brother nodes - // choose two farthest CFs as seedA and seedB - int seedA = 0, seedB = 0; - double maxDis = 0; - for (int i = 0; i < broNodes.size(); i++) - { - for (int j = i; j < broNodes.size(); j++) - { - if (maxDis < adjMatrix[i][j]) - { - seedA = i, seedB = j; - maxDis = adjMatrix[i][j]; - } - } - } - // insert the child node into the nearest seed(A / B) - for (auto node : broNodes) - { - node->ClearParents(); - } - // insert seedA node into new parent A and link them - newParentA->AddChild(broNodes[seedA]); - newParentA->Update(broNodes[seedA]); - // insert seed B node into new parent B and link them - parent->AddChild(broNodes[seedB]); - parent->Update(broNodes[seedB]); - // if other one brother node is near seed A then split it into new - // parent A, otherwise new parent B. - for (int i = 0; i < broNodes.size(); i++) - { - if (i != seedA && i != seedB) - { - if (adjMatrix[i][seedA] < adjMatrix[i][seedB]) - { - newParentA->AddChild(broNodes[i]); - newParentA->Update(broNodes[i]); // since the brother nodes list - // contains the one we insert our - // point, so after this function, the - // parent node's nls are also updated. 
- } - else - { - parent->AddChild(broNodes[i]); - parent->Update(broNodes[i]); - } - } - } - // if the current node(parent) is a cluster nodes, then we need to - // update the nls of its parent using new point. we only update the - // parparent in the first loop. - if (nodeIsClus) - { - parParent->Update(input, true); - } + // we need to create a new parent node since the old one has to + // split + auto newParentA = GenericFactory::New(shared_from_this(), dim); + // insert the new parent into the allNode list + // we also need to insert the new parent node into the clusterNode + // list if its children is a leaf node. + if (parent->children[0]->IsLeaf()) { + newParentA->index = leafMask++; + clusters_.push_back(newParentA); + } + // we only create a new parent rather and keep the old parent node + // as the split two sub-nodes so we need to refresh the old parent + // node as a blank one and treat it as a new parent B - if (parParent->children.size() <= max_in_nodes) - { - // b < B, remove the old node and insert the new nodeA and nodeB - // into the parent node. - break; - } - else - { - // b >= B, parent node of the current interior node capacity - // reaches the threshold B. 
- node = node->parent; - parent = parParent; - nodeIsClus = false; - } + // parent->parent = parParent; // link the parparent node and the new + // // created new parent A + parParent->AddChild(newParentA); + // clean cf of the old parent node and initialize the cf of new + // parent A (ls and ss all have d number of 0) + // split the child nodes of the old parent nodes + std::vector broNodes; + parent->children.swap(broNodes); + auto adjMatrix = + CalcAdjMatrix(broNodes); // calculate the distance between + // each two brother nodes + // choose two farthest CFs as seedA and seedB + int seedA = 0, seedB = 0; + double maxDis = 0; + for (int i = 0; i < broNodes.size(); i++) { + for (int j = i; j < broNodes.size(); j++) { + if (maxDis < adjMatrix[i][j]) { + seedA = i, seedB = j; + maxDis = adjMatrix[i][j]; + } + } + } + // insert the child node into the nearest seed(A / B) + for (auto node : broNodes) { + node->ClearParents(); + } + // insert seedA node into new parent A and link them + newParentA->AddChild(broNodes[seedA]); + newParentA->Update(broNodes[seedA]); + // insert seed B node into new parent B and link them + parent->AddChild(broNodes[seedB]); + parent->Update(broNodes[seedB]); + // if other one brother node is near seed A then split it into new + // parent A, otherwise new parent B. + for (int i = 0; i < broNodes.size(); i++) { + if (i != seedA && i != seedB) { + if (adjMatrix[i][seedA] < adjMatrix[i][seedB]) { + newParentA->AddChild(broNodes[i]); + newParentA->Update( + broNodes[i]); // since the brother nodes list + // contains the one we insert our + // point, so after this function, the + // parent node's nls are also updated. + } else { + parent->AddChild(broNodes[i]); + parent->Update(broNodes[i]); } + } } - return newNode; + // if the current node(parent) is a cluster nodes, then we need to + // update the nls of its parent using new point. we only update the + // parparent in the first loop. 
+ if (nodeIsClus) { + parParent->Update(input, true); + } + + if (parParent->children.size() <= max_in_nodes) { + // b < B, remove the old node and insert the new nodeA and nodeB + // into the parent node. + break; + } else { + // b >= B, parent node of the current interior node capacity + // reaches the threshold B. + node = node->parent; + parent = parParent; + nodeIsClus = false; + } + } } + return newNode; + } } -std::vector &ClusteringFeaturesTree::clusters() -{ - return clusters_; +std::vector & +ClusteringFeaturesTree::clusters() { + return clusters_; } -std::string ClusteringFeaturesTree::Serialize() -{ - std::deque q; - std::deque dep; - std::string s; - q.push_back(root_); - dep.push_back(0); - while (!q.empty()) - { - auto x = q.front(); - auto d = dep.front(); - q.pop_front(); - dep.pop_front(); - s += x->Serialize(d); - for (auto child : x->children) - { - q.push_front(child); - dep.push_front(d + 1); - } +std::string ClusteringFeaturesTree::Serialize() { + std::deque q; + std::deque dep; + std::string s; + q.push_back(root_); + dep.push_back(0); + while (!q.empty()) { + auto x = q.front(); + auto d = dep.front(); + q.pop_front(); + dep.pop_front(); + s += x->Serialize(d); + for (auto child : x->children) { + q.push_front(child); + dep.push_front(d + 1); } - return s; + } + return s; } ClusteringFeaturesList::ClusteringFeaturesList(const SesameParam ¶m) - : dim(param.dim), distance_threshold(param.distance_threshold) -{} + : dim(param.dim), distance_threshold(param.distance_threshold) {} ClusteringFeaturesList::~ClusteringFeaturesList() {} -ClusteringFeaturesList::NodePtr ClusteringFeaturesList::Insert(PointPtr point) -{ - if (clusters_.empty()) - { - auto node = GenericFactory::New(point); - clusters_.push_back(node); - return node; - } - else - { - auto [node, dist] = CalcClosestNode(clusters_, point); - if (dist >= distance_threshold) - { - node = GenericFactory::New(dim); - clusters_.push_back(node); - } - node->Update(point); - return node; 
+ClusteringFeaturesList::NodePtr ClusteringFeaturesList::Insert(PointPtr point) { + if (clusters_.empty()) { + auto node = GenericFactory::New(point); + clusters_.push_back(node); + return node; + } else { + auto [node, dist] = CalcClosestNode(clusters_, point); + if (dist >= distance_threshold) { + node = GenericFactory::New(dim); + clusters_.push_back(node); } + node->Update(point); + return node; + } } -ClusteringFeaturesList::NodePtr ClusteringFeaturesList::Insert(NodePtr node) -{ - clusters_.push_back(node); - return node; +ClusteringFeaturesList::NodePtr ClusteringFeaturesList::Insert(NodePtr node) { + clusters_.push_back(node); + return node; } -std::vector &ClusteringFeaturesList::clusters() -{ - return clusters_; +std::vector & +ClusteringFeaturesList::clusters() { + return clusters_; } -void ClusteringFeaturesList::Remove(NodePtr node) -{ - auto it = std::find(clusters_.begin(), clusters_.end(), node); - if (it != clusters_.end()) - { - clusters_.erase(it); - } +void ClusteringFeaturesList::Remove(NodePtr node) { + auto it = std::find(clusters_.begin(), clusters_.end(), node); + if (it != clusters_.end()) { + clusters_.erase(it); + } } -} // namespace SESAME +} // namespace SESAME diff --git a/src/Algorithm/DataStructure/Cache.cpp b/src/Algorithm/DataStructure/Cache.cpp index de7d0113..d407b771 100644 --- a/src/Algorithm/DataStructure/Cache.cpp +++ b/src/Algorithm/DataStructure/Cache.cpp @@ -6,88 +6,77 @@ #include #include #include -bool cmp(SESAME::DPNodePtr &c1, SESAME::DPNodePtr &c2) { return c1->GetRho() > c2->GetRho(); } -SESAME::Cache::Cache(int num, double a, double lamd, double r) -{ - std::vector clu(num); - this->buffer = clu; - this->num = num; - this->pnum = 0; - this->size = 0; - this->a = a; - this->lamd = lamd; - this->r = r; +bool cmp(SESAME::DPNodePtr &c1, SESAME::DPNodePtr &c2) { + return c1->GetRho() > c2->GetRho(); +} +SESAME::Cache::Cache(int num, double a, double lamd, double r) { + std::vector clu(num); + this->buffer = clu; + 
this->num = num; + this->pnum = 0; + this->size = 0; + this->a = a; + this->lamd = lamd; + this->r = r; } SESAME::Cache::~Cache() = default; -SESAME::Cache::Cache() = default; -SESAME::DPNodePtr SESAME::Cache::add(SESAME::PointPtr &p, double startTime) -{ - this->pnum++; - double dis = FLT_MAX; - double minDis = FLT_MAX; - SESAME::DPNodePtr nn; - for (int i = 0; i < size; i++) - { - dis = p->L2Dist(buffer[i]->GetCenter()); - if (dis < minDis) - { - minDis = dis; - nn = buffer[i]; - } - } - if (minDis <= r) - { - double coef = pow(a, lamd * double(startTime - nn->GetLastTime())); - nn->add(coef, startTime); - return nn; - } - else - { - SESAME::DPNodePtr c = std::make_shared(p, startTime); - buffer[size] = c; - size++; - return c; +SESAME::Cache::Cache() = default; +SESAME::DPNodePtr SESAME::Cache::add(SESAME::PointPtr &p, double startTime) { + this->pnum++; + double dis = FLT_MAX; + double minDis = FLT_MAX; + SESAME::DPNodePtr nn; + for (int i = 0; i < size; i++) { + dis = p->L2Dist(buffer[i]->GetCenter()); + if (dis < minDis) { + minDis = dis; + nn = buffer[i]; } + } + if (minDis <= r) { + double coef = pow(a, lamd * double(startTime - nn->GetLastTime())); + nn->add(coef, startTime); + return nn; + } else { + SESAME::DPNodePtr c = std::make_shared(p, startTime); + buffer[size] = c; + size++; + return c; + } } bool SESAME::Cache::isFull() { return pnum == num; } -void SESAME::Cache::compDeltaRho(double time) -{ - std::vector blankNode(size); - clus = blankNode; - for (int i = 0; i < size; i++) - { - buffer[i]->SetRho( - (float)(pow(a, lamd * (time - buffer[i]->GetLastTime())) * buffer[i]->GetRho())); - clus[i] = buffer[i]; - } - sort(clus.begin(), clus.end(), cmp); +void SESAME::Cache::compDeltaRho(double time) { + std::vector blankNode(size); + clus = blankNode; + for (int i = 0; i < size; i++) { + buffer[i]->SetRho((float)(pow(a, lamd * (time - buffer[i]->GetLastTime())) * + buffer[i]->GetRho())); + clus[i] = buffer[i]; + } + sort(clus.begin(), clus.end(), cmp); 
- double dis = 0; - clus[0]->SetDelta(0); - for (int i = 1; i < size; i++) - { - DPNodePtr cc = clus[i]; - auto minDis = DBL_MAX; - for (int j = i - 1; j >= 0; j--) - { - dis = cc->GetCenter()->L2Dist(clus[j]->GetCenter()); - if (minDis > dis) - { - minDis = dis; - cc->SetDep(clus[j]); - } - } - cc->SetDelta(minDis); - if (clus[0]->GetDelta() < minDis) - { - clus[0]->SetDelta(minDis); - } + double dis = 0; + clus[0]->SetDelta(0); + for (int i = 1; i < size; i++) { + DPNodePtr cc = clus[i]; + auto minDis = DBL_MAX; + for (int j = i - 1; j >= 0; j--) { + dis = cc->GetCenter()->L2Dist(clus[j]->GetCenter()); + if (minDis > dis) { + minDis = dis; + cc->SetDep(clus[j]); + } } + cc->SetDelta(minDis); + if (clus[0]->GetDelta() < minDis) { + clus[0]->SetDelta(minDis); + } + } } -void SESAME::Cache::getDPTree(double minRho, double minDelta, SESAME::DPTreePtr &dpTree, - SESAME::OutPtr &outs, std::unordered_set &clusters) -{ - dpTree->Init(clus, size, minRho, minDelta, outs, clusters); +void SESAME::Cache::getDPTree(double minRho, double minDelta, + SESAME::DPTreePtr &dpTree, SESAME::OutPtr &outs, + std::unordered_set &clusters) { + dpTree->Init(clus, size, minRho, minDelta, outs, clusters); } int SESAME::Cache::GetNum() { return num; } void SESAME::Cache::SetNum(int num) { Cache::num = num; } @@ -100,8 +89,12 @@ void SESAME::Cache::SetLamd(double lamd) { Cache::lamd = lamd; } double SESAME::Cache::GetR() { return r; } void SESAME::Cache::SetR(double r) { Cache::r = r; } std::vector &SESAME::Cache::GetBuffer() { return buffer; } -void SESAME::Cache::SetBuffer(std::vector &buffer) { Cache::buffer = buffer; } +void SESAME::Cache::SetBuffer(std::vector &buffer) { + Cache::buffer = buffer; +} std::vector &SESAME::Cache::GetClus() { return clus; } -void SESAME::Cache::SetClus(std::vector &clus) { Cache::clus = clus; } +void SESAME::Cache::SetClus(std::vector &clus) { + Cache::clus = clus; +} int SESAME::Cache::GetPnum() { return pnum; } void SESAME::Cache::SetPnum(int pnum) { 
Cache::pnum = pnum; } diff --git a/src/Algorithm/DataStructure/CharacteristicsVector.cpp b/src/Algorithm/DataStructure/CharacteristicsVector.cpp index 80489c41..65b55e02 100644 --- a/src/Algorithm/DataStructure/CharacteristicsVector.cpp +++ b/src/Algorithm/DataStructure/CharacteristicsVector.cpp @@ -4,123 +4,122 @@ #include SESAME::CharacteristicVector::CharacteristicVector() {} -SESAME::CharacteristicVector::CharacteristicVector(int updateTime, int removeTime, double Density, - int label, bool isSporadic, double dl, double dm) -{ - this->updateTime = updateTime; - this->densityUpdateTime = updateTime; - this->removeTime = removeTime; - this->gridDensity = Density; - this->densityUpdateTime = updateTime; - this->label = label; - this->isSporadic = isSporadic; - if (this->isSparse(dl)) - this->attribute = SPARSE; - else if (this->isDense(dm)) - this->attribute = DENSE; - else - this->attribute = TRANSITIONAL; - this->attChange = false; +SESAME::CharacteristicVector::CharacteristicVector(int updateTime, + int removeTime, + double Density, int label, + bool isSporadic, double dl, + double dm) { + this->updateTime = updateTime; + this->densityUpdateTime = updateTime; + this->removeTime = removeTime; + this->gridDensity = Density; + this->densityUpdateTime = updateTime; + this->label = label; + this->isSporadic = isSporadic; + if (this->isSparse(dl)) + this->attribute = SPARSE; + else if (this->isDense(dm)) + this->attribute = DENSE; + else + this->attribute = TRANSITIONAL; + this->attChange = false; } -bool SESAME::CharacteristicVector::isSparse(double dl) -{ - if (this->gridDensity <= dl) - return true; - else - return false; +bool SESAME::CharacteristicVector::isSparse(double dl) { + if (this->gridDensity <= dl) + return true; + else + return false; } -bool SESAME::CharacteristicVector::isDense(double dm) -{ - if (this->gridDensity >= dm) - return true; - else - return false; +bool SESAME::CharacteristicVector::isDense(double dm) { + if (this->gridDensity >= dm) + 
return true; + else + return false; } -bool SESAME::CharacteristicVector::isTransitional(double dm, double dl) -{ - if (this->gridDensity >= dl && this->gridDensity <= dm) - return true; - else - return false; +bool SESAME::CharacteristicVector::isTransitional(double dm, double dl) { + if (this->gridDensity >= dl && this->gridDensity <= dm) + return true; + else + return false; } -double SESAME::CharacteristicVector::getCurrGridDensity(int NowTime, double lambda) -{ - return pow(lambda, (NowTime - this->updateTime)) * this->gridDensity; +double SESAME::CharacteristicVector::getCurrGridDensity(int NowTime, + double lambda) { + return pow(lambda, (NowTime - this->updateTime)) * this->gridDensity; } // Landmark window -double SESAME::CharacteristicVector::getCurrGridDensity() { return this->gridDensity; } +double SESAME::CharacteristicVector::getCurrGridDensity() { + return this->gridDensity; +} -void SESAME::CharacteristicVector::densityWithNew(int NowTime, double decayFactor) -{ - // Update the density grid's density - this->gridDensity = getCurrGridDensity(NowTime, decayFactor) + 1.0; - // System.out.println(densityOfG); - this->densityUpdateTime = NowTime; +void SESAME::CharacteristicVector::densityWithNew(int NowTime, + double decayFactor) { + // Update the density grid's density + this->gridDensity = getCurrGridDensity(NowTime, decayFactor) + 1.0; + // System.out.println(densityOfG); + this->densityUpdateTime = NowTime; } // Landmark -void SESAME::CharacteristicVector::densityWithNew(int NowTime) -{ - // Update the density grid's density - this->gridDensity = getCurrGridDensity() + 1.0; - // System.out.println(densityOfG); - this->densityUpdateTime = NowTime; +void SESAME::CharacteristicVector::densityWithNew(int NowTime) { + // Update the density grid's density + this->gridDensity = getCurrGridDensity() + 1.0; + // System.out.println(densityOfG); + this->densityUpdateTime = NowTime; } -void SESAME::CharacteristicVector::UpdateAllDensity(int NowTime, double 
decayFactor, double dl, - double dm) -{ - // record the last attribute - int lastAtt = this->attribute; - // Update the density grid's density - this->gridDensity = getCurrGridDensity(NowTime, decayFactor); - this->densityUpdateTime = NowTime; +void SESAME::CharacteristicVector::UpdateAllDensity(int NowTime, + double decayFactor, + double dl, double dm) { + // record the last attribute + int lastAtt = this->attribute; + // Update the density grid's density + this->gridDensity = getCurrGridDensity(NowTime, decayFactor); + this->densityUpdateTime = NowTime; - // Evaluate whether the density grid is now SPARSE, DENSE or TRANSITIONAL - if (this->isSparse(dl)) - this->attribute = SPARSE; - else if (this->isDense(dm)) - this->attribute = DENSE; - else - this->attribute = TRANSITIONAL; - // Evaluate whether the density grid attribute has changed and set the attChange flag - // accordingly - if (this->attribute == lastAtt) - this->attChange = false; - else - this->attChange = true; + // Evaluate whether the density grid is now SPARSE, DENSE or TRANSITIONAL + if (this->isSparse(dl)) + this->attribute = SPARSE; + else if (this->isDense(dm)) + this->attribute = DENSE; + else + this->attribute = TRANSITIONAL; + // Evaluate whether the density grid attribute has changed and set the + // attChange flag accordingly + if (this->attribute == lastAtt) + this->attChange = false; + else + this->attChange = true; } // Landmark -void SESAME::CharacteristicVector::UpdateAllDensity(int NowTime, double dl, double dm) -{ - // record the last attribute - int lastAtt = this->attribute; - // Update the density grid's density - this->gridDensity = getCurrGridDensity(); - this->densityUpdateTime = NowTime; +void SESAME::CharacteristicVector::UpdateAllDensity(int NowTime, double dl, + double dm) { + // record the last attribute + int lastAtt = this->attribute; + // Update the density grid's density + this->gridDensity = getCurrGridDensity(); + this->densityUpdateTime = NowTime; - // Evaluate 
whether the density grid is now SPARSE, DENSE or TRANSITIONAL - if (this->isSparse(dl)) - this->attribute = SPARSE; - else if (this->isDense(dm)) - this->attribute = DENSE; - else - this->attribute = TRANSITIONAL; - // Evaluate whether the density grid attribute has changed and set the attChange flag - // accordingly - if (this->attribute == lastAtt) - this->attChange = false; - else - this->attChange = true; + // Evaluate whether the density grid is now SPARSE, DENSE or TRANSITIONAL + if (this->isSparse(dl)) + this->attribute = SPARSE; + else if (this->isDense(dm)) + this->attribute = DENSE; + else + this->attribute = TRANSITIONAL; + // Evaluate whether the density grid attribute has changed and set the + // attChange flag accordingly + if (this->attribute == lastAtt) + this->attChange = false; + else + this->attChange = true; } -void SESAME::CharacteristicVector::ChangeAttribute(double dl, double dm) -{ - if (this->isSparse(dl)) - this->attribute = SPARSE; - else if (this->isDense(dm)) - this->attribute = DENSE; - else - this->attribute = TRANSITIONAL; +void SESAME::CharacteristicVector::ChangeAttribute(double dl, double dm) { + if (this->isSparse(dl)) + this->attribute = SPARSE; + else if (this->isDense(dm)) + this->attribute = DENSE; + else + this->attribute = TRANSITIONAL; } \ No newline at end of file diff --git a/src/Algorithm/DataStructure/CoresetTree.cpp b/src/Algorithm/DataStructure/CoresetTree.cpp index b3bb3728..e80ecc42 100644 --- a/src/Algorithm/DataStructure/CoresetTree.cpp +++ b/src/Algorithm/DataStructure/CoresetTree.cpp @@ -1,4 +1,5 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) // // Created by Shuhao Zhang on 19/07/2021. 
@@ -13,230 +14,189 @@ using namespace std; using namespace SESAME; CoresetTree::CoresetTree(const param_t ¶m) - : param(param), - r(param.seed), - num_buckets(log((double)param.num_points / param.coreset_size) / log(2.0) + 2), - buckets(log((double)param.num_points / param.coreset_size) / log(2.0) + 2) -{} + : param(param), r(param.seed), + num_buckets( + log((double)param.num_points / param.coreset_size) / log(2.0) + 2), + buckets(log((double)param.num_points / param.coreset_size) / log(2.0) + + 2) {} void CoresetTree::Init() {} -vector CoresetTree::Union(const vector &a, const vector &b) -{ - auto n = a.size() + b.size(); - vector c(n, GenericFactory::New(param.dim)); - int choosen = 0; - int j = r.random_uniform(0, n - 1); - if (j < a.size()) - { - c[choosen] = a[j]; +vector CoresetTree::Union(const vector &a, + const vector &b) { + auto n = a.size() + b.size(); + vector c(n, GenericFactory::New(param.dim)); + int choosen = 0; + int j = r.random_uniform(0, n - 1); + if (j < a.size()) { + c[choosen] = a[j]; + } else { + j = j - a.size(); + c[choosen] = b[j]; + } + root = GenericFactory::New(c[choosen++]); + // clusters_ = {root}; + for (auto &i : a) { + root->Update(i); + } + for (auto &i : b) { + root->Update(i); + } + while (choosen < n) { + if (root->costs_sum_sq_dist > 0.0) { + auto leaf = Select(root); + auto center = ChooseCenter(leaf); + Split(leaf, center, choosen); + c[choosen] = center; + } else { + c[choosen] = root->center->copy(); + c[choosen]->index = -1; } - else - { - j = j - a.size(); - c[choosen] = b[j]; - } - root = GenericFactory::New(c[choosen++]); - // clusters_ = {root}; - for (auto &i : a) - { - root->Update(i); - } - for (auto &i : b) - { - root->Update(i); - } - while (choosen < n) - { - if (root->costs_sum_sq_dist > 0.0) - { - auto leaf = Select(root); - auto center = ChooseCenter(leaf); - Split(leaf, center, choosen); - c[choosen] = center; - } - else - { - c[choosen] = root->center->copy(); - c[choosen]->index = -1; - } - ++choosen; - 
} - return c; + ++choosen; + } + return c; } -CoresetTree::NodePtr CoresetTree::Select(NodePtr node) -{ - while (!node->IsLeaf()) - { - if (node->lc->costs_sum_sq_dist == 0.0 && node->rc->costs_sum_sq_dist == 0.0) - { - if (node->lc->cf.num == 0) - { - node = node->rc; - } - else if (node->rc->cf.num == 0) - { - node = node->lc; - } - else - { - node = r.bernoulli(0.5) ? node->lc : node->rc; - } - } - else - { - node = r.bernoulli(node->lc->costs_sum_sq_dist / node->costs_sum_sq_dist) ? node->lc - : node->rc; - } +CoresetTree::NodePtr CoresetTree::Select(NodePtr node) { + while (!node->IsLeaf()) { + if (node->lc->costs_sum_sq_dist == 0.0 && + node->rc->costs_sum_sq_dist == 0.0) { + if (node->lc->cf.num == 0) { + node = node->rc; + } else if (node->rc->cf.num == 0) { + node = node->lc; + } else { + node = r.bernoulli(0.5) ? node->lc : node->rc; + } + } else { + node = r.bernoulli(node->lc->costs_sum_sq_dist / node->costs_sum_sq_dist) + ? node->lc + : node->rc; } - return node; + } + return node; } -PointPtr CoresetTree::ChooseCenter(CoresetTree::NodePtr node) -{ - const int times = 3; - double min_cost = node->costs_sum_sq_dist; - PointPtr best_center = node->points[0]; - PointPtr centroid = node->Centroid(); - for (int j = 0; j < times; ++j) - { - double sum = 0.0; - double random = r.random_uniform(0.0, 1.0); - for (auto &p : node->points) - { - double dist = p->L2Dist(node->center); - sum += dist / node->costs_sum_sq_dist; - if (sum >= random) - { - double cost = min(centroid->L2Dist(node->center), centroid->L2Dist(p)); - if (cost < min_cost) - { - min_cost = cost; - best_center = p; - } - break; - } +PointPtr CoresetTree::ChooseCenter(CoresetTree::NodePtr node) { + const int times = 3; + double min_cost = node->costs_sum_sq_dist; + PointPtr best_center = node->points[0]; + PointPtr centroid = node->Centroid(); + for (int j = 0; j < times; ++j) { + double sum = 0.0; + double random = r.random_uniform(0.0, 1.0); + for (auto &p : node->points) { + double dist = 
p->L2Dist(node->center); + sum += dist / node->costs_sum_sq_dist; + if (sum >= random) { + double cost = min(centroid->L2Dist(node->center), centroid->L2Dist(p)); + if (cost < min_cost) { + min_cost = cost; + best_center = p; } + break; + } } - return best_center; + } + return best_center; } -void CoresetTree::Split(CoresetTree::NodePtr parent, PointPtr center, int index) -{ - NodePtr lc = GenericFactory::New(center), rc = GenericFactory::New(center); - // clusters_.push_back(lc), clusters_.push_back(rc); - lc->center = parent->center; - lc->parent = parent; - rc->center = parent->center; - rc->parent = parent; - parent->lc = lc, parent->rc = rc; - for (auto &p : parent->points) - { - p->clu_id = index; - if (p->L2Dist(center) < p->L2Dist(parent->center)) - { - rc->Update(p); - } - else - { - lc->Update(p); - } - } - while (parent != nullptr) - { - parent->costs_sum_sq_dist = parent->lc->costs_sum_sq_dist + parent->rc->costs_sum_sq_dist; - parent->costs_sum_dist = parent->lc->costs_sum_dist + parent->rc->costs_sum_dist; - parent = parent->parent; +void CoresetTree::Split(CoresetTree::NodePtr parent, PointPtr center, + int index) { + NodePtr lc = GenericFactory::New(center), + rc = GenericFactory::New(center); + // clusters_.push_back(lc), clusters_.push_back(rc); + lc->center = parent->center; + lc->parent = parent; + rc->center = parent->center; + rc->parent = parent; + parent->lc = lc, parent->rc = rc; + for (auto &p : parent->points) { + p->clu_id = index; + if (p->L2Dist(center) < p->L2Dist(parent->center)) { + rc->Update(p); + } else { + lc->Update(p); } + } + while (parent != nullptr) { + parent->costs_sum_sq_dist = + parent->lc->costs_sum_sq_dist + parent->rc->costs_sum_sq_dist; + parent->costs_sum_dist = + parent->lc->costs_sum_dist + parent->rc->costs_sum_dist; + parent = parent->parent; + } } -CoresetTree::NodePtr CoresetTree::Insert(PointPtr input) -{ - buckets[0].base->push_back(input); - if (buckets[0].base->size() == param.coreset_size) - { - int cur 
= 0, next = 1; - if (buckets[next].base->size() == 0) - { - buckets[next].base = buckets[cur].base; - buckets[cur].base = GenericFactory::New>(); - } - else - { - buckets[next].spill = buckets[cur].base; - buckets[cur].base = GenericFactory::New>(); - ++cur, ++next; - while (buckets[next].base->size() == param.coreset_size) - { - *buckets[next].spill.get() = - Union(*buckets[cur].base.get(), *buckets[cur].spill.get()); - ++cur, ++next; - } - *buckets[next].spill.get() = Union(*buckets[cur].base.get(), *buckets[cur].spill.get()); - } +CoresetTree::NodePtr CoresetTree::Insert(PointPtr input) { + buckets[0].base->push_back(input); + if (buckets[0].base->size() == param.coreset_size) { + int cur = 0, next = 1; + if (buckets[next].base->size() == 0) { + buckets[next].base = buckets[cur].base; + buckets[cur].base = GenericFactory::New>(); + } else { + buckets[next].spill = buckets[cur].base; + buckets[cur].base = GenericFactory::New>(); + ++cur, ++next; + while (buckets[next].base->size() == param.coreset_size) { + *buckets[next].spill.get() = + Union(*buckets[cur].base.get(), *buckets[cur].spill.get()); + ++cur, ++next; + } + *buckets[next].spill.get() = + Union(*buckets[cur].base.get(), *buckets[cur].spill.get()); } - return root; + } + return root; } -CoresetTree::NodePtr CoresetTree::Insert(NodePtr node) -{ - clusters_.push_back(node); - return node; +CoresetTree::NodePtr CoresetTree::Insert(NodePtr node) { + clusters_.push_back(node); + return node; } -void CoresetTree::Remove(NodePtr node) -{ - auto it = find(clusters_.begin(), clusters_.end(), node); - if (it != clusters_.end()) - { - clusters_.erase(it); - } +void CoresetTree::Remove(NodePtr node) { + auto it = find(clusters_.begin(), clusters_.end(), node); + if (it != clusters_.end()) { + clusters_.erase(it); + } } -vector &CoresetTree::clusters() -{ - if (!clusters_.empty()) return clusters_; - Points points = nullptr; - if (buckets[num_buckets - 1].base->size() == param.coreset_size) - { - points = 
buckets[num_buckets - 1].base; +vector &CoresetTree::clusters() { + if (!clusters_.empty()) + return clusters_; + Points points = nullptr; + if (buckets[num_buckets - 1].base->size() == param.coreset_size) { + points = buckets[num_buckets - 1].base; + } else { + int i = 0; + for (; i < num_buckets; ++i) { + if (buckets[i].base->size() == param.coreset_size) { + points = buckets[i].base; + break; + } } - else - { - int i = 0; - for (; i < num_buckets; ++i) - { - if (buckets[i].base->size() == param.coreset_size) - { - points = buckets[i].base; - break; - } - } - for (int j = i + 1; j < num_buckets; ++j) - { - if (buckets[j].base->size() != 0) - { - *buckets[j].spill.get() = Union(*buckets[j].base.get(), *points.get()); - points = buckets[j].spill; - } - } + for (int j = i + 1; j < num_buckets; ++j) { + if (buckets[j].base->size() != 0) { + *buckets[j].spill.get() = Union(*buckets[j].base.get(), *points.get()); + points = buckets[j].spill; + } } - return clusters_ = Points2Nodes(points); + } + return clusters_ = Points2Nodes(points); } -vector CoresetTree::Points2Nodes(CoresetTree::Points points) -{ - vector nodes; - if (points == nullptr) - { - return nodes; - } - for (auto &p : *points.get()) - { - auto node = GenericFactory::New(p); - node->Update(p); - nodes.push_back(node); - } +vector +CoresetTree::Points2Nodes(CoresetTree::Points points) { + vector nodes; + if (points == nullptr) { return nodes; + } + for (auto &p : *points.get()) { + auto node = GenericFactory::New(p); + node->Update(p); + nodes.push_back(node); + } + return nodes; } diff --git a/src/Algorithm/DataStructure/DPNode.cpp b/src/Algorithm/DataStructure/DPNode.cpp index 1ee586b9..d9396605 100644 --- a/src/Algorithm/DataStructure/DPNode.cpp +++ b/src/Algorithm/DataStructure/DPNode.cpp @@ -10,14 +10,13 @@ * DPNode */ static int id = 0; -SESAME::DPNode::DPNode() -{ - this->cid = -1; - this->Cid = -1; - this->lastTime = 0; - this->inactiveTime = 0; - this->num = 0; - this->active = false; 
+SESAME::DPNode::DPNode() { + this->cid = -1; + this->Cid = -1; + this->lastTime = 0; + this->inactiveTime = 0; + this->num = 0; + this->active = false; } SESAME::DPNode::~DPNode() {} @@ -35,32 +34,41 @@ void SESAME::DPNode::SetDelta(double delta) { DPNode::delta = delta; } SESAME::DPNodePtr &SESAME::DPNode::GetDep() { return dep; } void SESAME::DPNode::SetDep(SESAME::DPNodePtr &dep) { DPNode::dep = dep; } SESAME::PointPtr &SESAME::DPNode::GetCenter() { return center; } -void SESAME::DPNode::SetCenter(SESAME::PointPtr ¢er) { DPNode::center = center; } +void SESAME::DPNode::SetCenter(SESAME::PointPtr ¢er) { + DPNode::center = center; +} double SESAME::DPNode::GetLastTime() { return lastTime; } void SESAME::DPNode::SetLastTime(double last_time) { lastTime = last_time; } bool SESAME::DPNode::IsActive() { return active; } void SESAME::DPNode::SetActive(bool active) { DPNode::active = active; } -std::unordered_set &SESAME::DPNode::GetSucs() { return sucs; } -void SESAME::DPNode::SetSucs(std::unordered_set &sucs) { DPNode::sucs = sucs; } +std::unordered_set &SESAME::DPNode::GetSucs() { + return sucs; +} +void SESAME::DPNode::SetSucs(std::unordered_set &sucs) { + DPNode::sucs = sucs; +} SESAME::ClusterPtr &SESAME::DPNode::GetCluster() { return cluster; } -void SESAME::DPNode::SetCluster(SESAME::ClusterPtr &cluster) { DPNode::cluster = cluster; } +void SESAME::DPNode::SetCluster(SESAME::ClusterPtr &cluster) { + DPNode::cluster = cluster; +} double SESAME::DPNode::GetInactiveTime() { return inactiveTime; } -void SESAME::DPNode::SetInactiveTime(double inactive_time) { inactiveTime = inactive_time; } +void SESAME::DPNode::SetInactiveTime(double inactive_time) { + inactiveTime = inactive_time; +} double SESAME::DPNode::GetDis() { return dis; } void SESAME::DPNode::SetDis(double dis) { DPNode::dis = dis; } -SESAME::DPNode::DPNode(SESAME::PointPtr &p, double time) -{ - this->cid = id++; - this->rho = 1; - this->delta = FLT_MAX; - this->dep = nullptr; - this->center = p->copy(); 
- this->lastTime = time; - this->active = false; - this->Cid = 0; - this->inactiveTime = 0; - this->dis = 0; +SESAME::DPNode::DPNode(SESAME::PointPtr &p, double time) { + this->cid = id++; + this->rho = 1; + this->delta = FLT_MAX; + this->dep = nullptr; + this->center = p->copy(); + this->lastTime = time; + this->active = false; + this->Cid = 0; + this->inactiveTime = 0; + this->dis = 0; } /** @@ -68,38 +76,44 @@ SESAME::DPNode::DPNode(SESAME::PointPtr &p, double time) */ int SESAME::Cluster::GetLabel() { return label; } void SESAME::Cluster::SetLabel(int label) { Cluster::label = label; } -std::unordered_set &SESAME::Cluster::GetCells() { return cells; } -void SESAME::Cluster::SetCells(std::unordered_set &cells) { Cluster::cells = cells; } +std::unordered_set &SESAME::Cluster::GetCells() { + return cells; +} +void SESAME::Cluster::SetCells(std::unordered_set &cells) { + Cluster::cells = cells; +} SESAME::Cluster::Cluster() { this->label = -1; } SESAME::Cluster::Cluster(int label) { this->label = label; } -void SESAME::Cluster::add(DPNodePtr &node) -{ - assert(node); - this->cells.insert(node); +void SESAME::Cluster::add(DPNodePtr &node) { + assert(node); + this->cells.insert(node); } void SESAME::Cluster::remove(DPNodePtr &node) { this->cells.erase(node); } -void SESAME::DPNode::insert(double time) -{ - this->rho++; - this->lastTime = time; -} -void SESAME::DPNode::add(double coef, double time) -{ - this->rho = coef * this->rho + 1; - this->lastTime = time; -} -void SESAME::DPNode::addSuccessor(SESAME::DPNodePtr &node) { this->sucs.insert(node); } -void SESAME::DPNode::removeSuccessor(SESAME::DPNodePtr &node) { this->sucs.erase(node); } +void SESAME::DPNode::insert(double time) { + this->rho++; + this->lastTime = time; +} +void SESAME::DPNode::add(double coef, double time) { + this->rho = coef * this->rho + 1; + this->lastTime = time; +} +void SESAME::DPNode::addSuccessor(SESAME::DPNodePtr &node) { + this->sucs.insert(node); +} +void 
SESAME::DPNode::removeSuccessor(SESAME::DPNodePtr &node) { + this->sucs.erase(node); +} bool SESAME::DPNode::hasSuccessor() { return !sucs.empty(); } -double SESAME::DPNode::getDisTo(SESAME::DPNodePtr &node) -{ - double distance = 0; - for (int i = 0; i < this->center->getDimension(); i++) - { - distance = - distance + pow(node->center->getFeatureItem(i) - this->center->getFeatureItem(i), 2); - } - return sqrt(distance); -} -SESAME::DPNodePtr SESAME::DPNode::copy() { return std::make_shared(*this); } +double SESAME::DPNode::getDisTo(SESAME::DPNodePtr &node) { + double distance = 0; + for (int i = 0; i < this->center->getDimension(); i++) { + distance = distance + pow(node->center->getFeatureItem(i) - + this->center->getFeatureItem(i), + 2); + } + return sqrt(distance); +} +SESAME::DPNodePtr SESAME::DPNode::copy() { + return std::make_shared(*this); +} diff --git a/src/Algorithm/DataStructure/DPTree.cpp b/src/Algorithm/DataStructure/DPTree.cpp index 6a6cf3a6..a371ea3b 100644 --- a/src/Algorithm/DataStructure/DPTree.cpp +++ b/src/Algorithm/DataStructure/DPTree.cpp @@ -4,656 +4,537 @@ #include #include +#include #include #include #include +using namespace std; + /** * DPTree */ SESAME::DPTree::DPTree() {} SESAME::DPTree::~DPTree() {} -SESAME::DPTree::DPTree(int num, double CluR) -{ - this->num = num; - std::vector clus(this->num); - this->Clus = clus; - this->CluR = CluR; - this->cluLabel = 0; +SESAME::DPTree::DPTree(int num, double CluR) { + this->num = num; + std::vector clus(this->num); + this->Clus = clus; + this->CluR = CluR; + this->cluLabel = 0; } -void SESAME::DPTree::insert(SESAME::DPNodePtr &cc, int opt) -{ - cc->SetActive(true); - Clus[size] = cc; - size++; +void SESAME::DPTree::insert(SESAME::DPNodePtr &cc, int opt) { + cc->SetActive(true); + Clus[size] = cc; + size++; - if (opt == 0) - { - adjustNoOpt(size - 1); - } - if (opt == 1) - { - adjustOpt1(size - 1); - } + if (opt == 0) { + adjustNoOpt(size - 1); + } + if (opt == 1) { + adjustOpt1(size - 1); + } 
- if (opt == 2) - { - adjust(size - 1); - } + if (opt == 2) { + adjust(size - 1); + } - if (opt == -1) - { - adjustNoDelta(size - 1); - } + if (opt == -1) { + adjustNoDelta(size - 1); + } - if (size == num) - { - SESAME_DEBUG("lack of DPTree nodes"); - } + if (size == num) { + SESAME_DEBUG("lack of DPTree nodes"); + } } -void SESAME::DPTree::Init(std::vector &clus, int size, double minRho, - double minDelta, SESAME::OutPtr &outs, - std::unordered_set &clusters) -{ - this->minDelta = minDelta; - Clus[0] = clus[0]; - SESAME::ClusterPtr cluster = std::make_shared(cluLabel++); - cluster->add(Clus[0]); - Clus[0]->SetCluster(cluster); - clusters.insert(cluster); - int i = 1; +void SESAME::DPTree::Init(std::vector &clus, int size, + double minRho, double minDelta, SESAME::OutPtr &outs, + std::unordered_set &clusters) { + this->minDelta = minDelta; + Clus[0] = clus[0]; + SESAME::ClusterPtr cluster = std::make_shared(cluLabel++); + cluster->add(Clus[0]); + Clus[0]->SetCluster(cluster); + clusters.insert(cluster); + int i = 1; - for (; i < size && clus[i]->GetRho() >= minRho; i++) - { - Clus[i] = clus[i]; - std::unordered_set sucs = Clus[i]->GetDep()->GetSucs(); - sucs.insert(Clus[i]); - if (Clus[i]->GetDelta() > minDelta) - { - SESAME::ClusterPtr c = std::make_shared(cluLabel++); - ; - c->add(Clus[i]); - Clus[i]->SetCluster(c); - clusters.insert(c); - } - else - { - Clus[i]->GetDep()->GetCluster()->add(Clus[i]); - Clus[i]->SetCluster(Clus[i]->GetDep()->GetCluster()); - } - } - double maxDelta = 0; - for (int j = 1; j < size && j < i; j++) - { - if (maxDelta < Clus[j]->GetDelta()) - { - maxDelta = Clus[j]->GetDelta(); - } - } - Clus[0]->SetDelta(maxDelta); - this->size = i; - for (; i < size; i++) - { - outs->insert(clus[i]); - } + for (; i < size && clus[i]->GetRho() >= minRho; i++) { + Clus[i] = clus[i]; + std::unordered_set sucs = Clus[i]->GetDep()->GetSucs(); + sucs.insert(Clus[i]); + if (Clus[i]->GetDelta() > minDelta) { + SESAME::ClusterPtr c = 
std::make_shared(cluLabel++); + ; + c->add(Clus[i]); + Clus[i]->SetCluster(c); + clusters.insert(c); + } else { + Clus[i]->GetDep()->GetCluster()->add(Clus[i]); + Clus[i]->SetCluster(Clus[i]->GetDep()->GetCluster()); + } + } + double maxDelta = 0; + for (int j = 1; j < size && j < i; j++) { + if (maxDelta < Clus[j]->GetDelta()) { + maxDelta = Clus[j]->GetDelta(); + } + } + Clus[0]->SetDelta(maxDelta); + this->size = i; + for (; i < size; i++) { + outs->insert(clus[i]); + } } -SESAME::DPNodePtr SESAME::DPTree::findNN(SESAME::PointPtr p, double coef, int opt, double time) -{ - int index = 0; - double dis = 0; - auto minDis = DBL_MAX; - for (int i = 0; i < size; i++) - { - Clus[i]->SetRho(Clus[i]->GetRho() * coef); - dis = p->L2Dist(Clus[i]->GetCenter()); - Clus[i]->SetDis(dis); - if (dis < minDis) - { - minDis = dis; - index = i; - } - } +SESAME::DPNodePtr SESAME::DPTree::findNN(SESAME::PointPtr p, double coef, + int opt, double time) { + int index = 0; + double dis = 0; + auto minDis = DBL_MAX; + for (int i = 0; i < size; i++) { + Clus[i]->SetRho(Clus[i]->GetRho() * coef); + dis = p->L2Dist(Clus[i]->GetCenter()); + Clus[i]->SetDis(dis); + if (dis < minDis) { + minDis = dis; + index = i; + } + } - p->setMinDist(minDis); - auto cc = Clus[index]; - if (minDis <= CluR) - { - Clus[index]->insert(time); - if (opt == 0) - { - adjustNoOpt(index); - } - if (opt == 1) - { - adjustOpt1(index); - } + p->setMinDist(minDis); + auto cc = Clus[index]; + if (minDis <= CluR) { + Clus[index]->insert(time); + if (opt == 0) { + adjustNoOpt(index); + } + if (opt == 1) { + adjustOpt1(index); + } - if (opt == 2) - { - adjust(index); - } + if (opt == 2) { + adjust(index); + } - if (opt == -1) - { - adjustNoDelta(index); - } + if (opt == -1) { + adjustNoDelta(index); } - return cc; + } + return cc; } -void SESAME::DPTree::adjustNoDelta(int index) -{ - Clus[0]->SetDelta(DBL_MAX); - auto clu = Clus[index]; +void SESAME::DPTree::adjustNoDelta(int index) { + Clus[0]->SetDelta(DBL_MAX); + auto 
clu = Clus[index]; - if (index > 0) - { - for (int i = index - 1; i >= 0; i--) - { - if (clu->GetRho() > Clus[i]->GetRho()) - { - Clus[i + 1] = Clus[i]; - Clus[i] = clu; - } - else - { - break; - } - } - } + if (index > 0) { + for (int i = index - 1; i >= 0; i--) { + if (clu->GetRho() > Clus[i]->GetRho()) { + Clus[i + 1] = Clus[i]; + Clus[i] = clu; + } else { + break; + } + } + } } -void SESAME::DPTree::adjustNoOpt(int index) -{ - Clus[0]->SetDelta(DBL_MAX); - auto clu = Clus[index]; - int position = index; +void SESAME::DPTree::adjustNoOpt(int index) { + Clus[0]->SetDelta(DBL_MAX); + auto clu = Clus[index]; + int position = index; - if (index > 0) - { - for (int i = index - 1; i >= 0; i--) - { - if (clu->GetRho() > Clus[i]->GetRho()) - { - Clus[i + 1] = Clus[i]; - Clus[i] = clu; - position = i; - } - else - { - break; - } - } - } - if (Clus[0] == clu) - { - clu->SetDelta(DBL_MAX); - } + if (index > 0) { + for (int i = index - 1; i >= 0; i--) { + if (clu->GetRho() > Clus[i]->GetRho()) { + Clus[i + 1] = Clus[i]; + Clus[i] = clu; + position = i; + } else { + break; + } + } + } + if (Clus[0] == clu) { + clu->SetDelta(DBL_MAX); + } - computeDeltaNoOpt(position); - computeHeadDelta(); + computeDeltaNoOpt(position); + computeHeadDelta(); } -void SESAME::DPTree::computeDeltaNoOpt(int index) -{ - auto clu = Clus[index]; - if (clu->GetDep() != nullptr) - { - clu->GetDep()->removeSuccessor(clu); - } - double dis = 0; - clu->SetDelta(DBL_MAX); - for (int i = size - 1; i >= 0; i--) - { - if (i < index) - { - dis = clu->GetCenter()->L2Dist(Clus[i]->GetCenter()); - if (clu->GetDelta() > dis) - { - if (clu->GetDep() != nullptr) - { - clu->GetDep()->removeSuccessor(clu); - } - clu->SetDelta(dis); - clu->SetDep(Clus[i]); - Clus[i]->addSuccessor(clu); - } +void SESAME::DPTree::computeDeltaNoOpt(int index) { + auto clu = Clus[index]; + if (clu->GetDep() != nullptr) { + clu->GetDep()->removeSuccessor(clu); + } + double dis = 0; + clu->SetDelta(DBL_MAX); + for (int i = size - 1; i >= 
0; i--) { + if (i < index) { + dis = clu->GetCenter()->L2Dist(Clus[i]->GetCenter()); + if (clu->GetDelta() > dis) { + if (clu->GetDep() != nullptr) { + clu->GetDep()->removeSuccessor(clu); } - if (i > index) - { - if (Clus[i]->GetDelta() > dis) - { - if (Clus[i]->GetDep() != nullptr) - { - Clus[i]->GetDep()->removeSuccessor(Clus[i]); - } - Clus[i]->SetDep(clu); - clu->addSuccessor(Clus[i]); - Clus[i]->SetDelta(dis); - } + clu->SetDelta(dis); + clu->SetDep(Clus[i]); + Clus[i]->addSuccessor(clu); + } + } + if (i > index) { + if (Clus[i]->GetDelta() > dis) { + if (Clus[i]->GetDep() != nullptr) { + Clus[i]->GetDep()->removeSuccessor(Clus[i]); } + Clus[i]->SetDep(clu); + clu->addSuccessor(Clus[i]); + Clus[i]->SetDelta(dis); + } } + } } -void SESAME::DPTree::adjustOpt1(int index) -{ - Clus[0]->SetDelta(DBL_MAX); - auto clu = Clus[index]; - if (clu->GetDep() != nullptr && clu->GetDep()->GetRho() < clu->GetRho()) - { - clu->GetDep()->removeSuccessor(clu); - clu->SetDelta(DBL_MAX); - } - int position = index; - double dis = 0; +void SESAME::DPTree::adjustOpt1(int index) { + Clus[0]->SetDelta(DBL_MAX); + auto clu = Clus[index]; + if (clu->GetDep() != nullptr && clu->GetDep()->GetRho() < clu->GetRho()) { + clu->GetDep()->removeSuccessor(clu); + clu->SetDelta(DBL_MAX); + } + int position = index; + double dis = 0; - if (index > 0) - { - for (int i = index - 1; i >= 0; i--) - { - if (clu->GetRho() > Clus[i]->GetRho()) - { - dis = Clus[i]->getDisTo(clu); - if (dis <= Clus[i]->GetDelta()) - { - if (Clus[i]->GetDep() != nullptr) - { - Clus[i]->GetDep()->removeSuccessor(Clus[i]); - } - Clus[i]->SetDep(clu); - clu->addSuccessor(Clus[i]); - Clus[i]->SetDelta(dis); - } - Clus[i + 1] = Clus[i]; - Clus[i] = clu; - position = i; - } - else - { - break; - } + if (index > 0) { + for (int i = index - 1; i >= 0; i--) { + if (clu->GetRho() > Clus[i]->GetRho()) { + dis = Clus[i]->getDisTo(clu); + if (dis <= Clus[i]->GetDelta()) { + if (Clus[i]->GetDep() != nullptr) { + 
Clus[i]->GetDep()->removeSuccessor(Clus[i]); + } + Clus[i]->SetDep(clu); + clu->addSuccessor(Clus[i]); + Clus[i]->SetDelta(dis); } - } - if (Clus[0] == clu) - { - clu->SetDelta(DBL_MAX); - position = 0; - } + Clus[i + 1] = Clus[i]; + Clus[i] = clu; + position = i; + } else { + break; + } + } + } + if (Clus[0] == clu) { + clu->SetDelta(DBL_MAX); + position = 0; + } - if (position != 0 && (clu->GetDep() == nullptr || clu->GetRho() > clu->GetDep()->GetRho())) - { - clu->SetDelta(DBL_MAX); + if (position != 0 && + (clu->GetDep() == nullptr || clu->GetRho() > clu->GetDep()->GetRho())) { + clu->SetDelta(DBL_MAX); - computeDeltaF1(position); - } - computeHeadDelta(); + computeDeltaF1(position); + } + computeHeadDelta(); } -void SESAME::DPTree::computeDeltaF1(int index) -{ - auto clu = Clus[index]; - if (clu->GetDep() != nullptr) - { - clu->GetDep()->removeSuccessor(clu); - } - clu->SetDelta(DBL_MAX); - if (index == 0) - { - return; - } - double dis = 0; +void SESAME::DPTree::computeDeltaF1(int index) { + auto clu = Clus[index]; + if (clu->GetDep() != nullptr) { + clu->GetDep()->removeSuccessor(clu); + } + clu->SetDelta(DBL_MAX); + if (index == 0) { + return; + } + double dis = 0; - // with one optimization + // with one optimization - for (int i = index - 1; i >= 0; i--) - { - dis = clu->GetCenter()->L2Dist(Clus[i]->GetCenter()); - if (dis < clu->GetDelta()) - { - clu->SetDep(Clus[i]); - clu->SetDelta(dis); - } + for (int i = index - 1; i >= 0; i--) { + dis = clu->GetCenter()->L2Dist(Clus[i]->GetCenter()); + if (dis < clu->GetDelta()) { + clu->SetDep(Clus[i]); + clu->SetDelta(dis); } + } - if (clu->GetDep() != nullptr) - { - clu->GetDep()->addSuccessor(clu); - } + if (clu->GetDep() != nullptr) { + clu->GetDep()->addSuccessor(clu); + } } -void SESAME::DPTree::adjust(int index) -{ - Clus[0]->SetDelta(DBL_MAX); - auto clu = Clus[index]; - if (clu->GetDep() != nullptr && clu->GetDep()->GetRho() < clu->GetRho()) - { - clu->GetDep()->removeSuccessor(clu); - 
clu->SetDelta(DBL_MAX); - } - int position = index; - double dis = 0; +void SESAME::DPTree::adjust(int index) { + Clus[0]->SetDelta(DBL_MAX); + auto clu = Clus[index]; + if (clu->GetDep() != nullptr && clu->GetDep()->GetRho() < clu->GetRho()) { + clu->GetDep()->removeSuccessor(clu); + clu->SetDelta(DBL_MAX); + } + int position = index; + double dis = 0; - if (index > 0) - { - for (int i = index - 1; i >= 0; i--) - { - if (clu->GetRho() > Clus[i]->GetRho()) - { - if (Clus[i]->GetDelta() > Clus[i]->GetDis() - clu->GetDis()) - { - dis = Clus[i]->getDisTo(clu); - if (dis < Clus[i]->GetDelta()) - { - if (Clus[i]->GetDep() != nullptr) - { - Clus[i]->GetDep()->removeSuccessor(Clus[i]); - } - Clus[i]->SetDep(clu); - clu->addSuccessor(Clus[i]); - Clus[i]->SetDelta(dis); - } - } - Clus[i + 1] = Clus[i]; - Clus[i] = clu; - position = i; - } - else - { - break; + if (index > 0) { + for (int i = index - 1; i >= 0; i--) { + if (clu->GetRho() > Clus[i]->GetRho()) { + if (Clus[i]->GetDelta() > Clus[i]->GetDis() - clu->GetDis()) { + dis = Clus[i]->getDisTo(clu); + if (dis < Clus[i]->GetDelta()) { + if (Clus[i]->GetDep() != nullptr) { + Clus[i]->GetDep()->removeSuccessor(Clus[i]); } + Clus[i]->SetDep(clu); + clu->addSuccessor(Clus[i]); + Clus[i]->SetDelta(dis); + } } - } - if (Clus[0] == clu) - { - clu->SetDelta(DBL_MAX); - } - if (position != 0 && (clu->GetDep() == nullptr || clu->GetRho() > clu->GetDep()->GetRho())) - { - clu->SetDelta(DBL_MAX); + Clus[i + 1] = Clus[i]; + Clus[i] = clu; + position = i; + } else { + break; + } + } + } + if (Clus[0] == clu) { + clu->SetDelta(DBL_MAX); + } + if (position != 0 && + (clu->GetDep() == nullptr || clu->GetRho() > clu->GetDep()->GetRho())) { + clu->SetDelta(DBL_MAX); - computeDelta(position); - } - computeHeadDelta(); + computeDelta(position); + } + computeHeadDelta(); } -void SESAME::DPTree::computeHeadDelta() -{ - auto clu = Clus[0]; - if (clu->GetDep() != nullptr) - { - clu->GetDep()->removeSuccessor(clu); - } +void 
SESAME::DPTree::computeHeadDelta() { + auto clu = Clus[0]; + if (clu->GetDep() != nullptr) { + clu->GetDep()->removeSuccessor(clu); + } - double maxValue = 0; - double secondValue = 0; - for (int i = 1; i < size; i++) - { - if (maxValue < Clus[i]->GetDelta()) - { - secondValue = maxValue; - maxValue = Clus[i]->GetDelta(); - } - else if (secondValue < Clus[i]->GetDelta()) - { - secondValue = Clus[i]->GetDelta(); - } - } - if (maxValue > 3 * secondValue) - { - clu->SetDelta(maxValue); - } - else - { - clu->SetDelta((maxValue + secondValue) / 2); - } + double maxValue = 0; + double secondValue = 0; + for (int i = 1; i < size; i++) { + if (maxValue < Clus[i]->GetDelta()) { + secondValue = maxValue; + maxValue = Clus[i]->GetDelta(); + } else if (secondValue < Clus[i]->GetDelta()) { + secondValue = Clus[i]->GetDelta(); + } + } + if (maxValue > 3 * secondValue) { + clu->SetDelta(maxValue); + } else { + clu->SetDelta((maxValue + secondValue) / 2); + } } -void SESAME::DPTree::computeDelta(int index) -{ - auto clu = Clus[index]; - if (clu->GetDep() != nullptr) - { - clu->GetDep()->removeSuccessor(clu); - } - clu->SetDelta(DBL_MAX); - if (index == 0) - { - return; - } - double dis = 0; +void SESAME::DPTree::computeDelta(int index) { + auto clu = Clus[index]; + if (clu->GetDep() != nullptr) { + clu->GetDep()->removeSuccessor(clu); + } + clu->SetDelta(DBL_MAX); + if (index == 0) { + return; + } + double dis = 0; - for (int i = index - 1; i >= 0; i--) - { - if (clu->GetDelta() > Clus[i]->GetDis() - clu->GetDis()) - { - dis = clu->GetCenter()->L2Dist(Clus[i]->GetCenter()); + for (int i = index - 1; i >= 0; i--) { + if (clu->GetDelta() > Clus[i]->GetDis() - clu->GetDis()) { + dis = clu->GetCenter()->L2Dist(Clus[i]->GetCenter()); - if (dis < clu->GetDelta()) - { - clu->SetDep(Clus[i]); - clu->SetDelta(dis); - } - } + if (dis < clu->GetDelta()) { + clu->SetDep(Clus[i]); + clu->SetDelta(dis); + } } + } - if (clu->GetDep() != nullptr) - { - clu->GetDep()->addSuccessor(clu); - } + if 
(clu->GetDep() != nullptr) { + clu->GetDep()->addSuccessor(clu); + } } -void SESAME::DPTree::deleteInact(SESAME::OutPtr &outres, double minRho, double time) -{ - for (int i = size - 1; i > 0; i--) - { - if (Clus[i]->GetRho() < minRho) - { - auto cc = Clus[i]; - Clus[i] = nullptr; - size--; - cc->SetActive(false); - cc->SetInactiveTime(time); - std::unordered_set cells = cc->GetDep()->GetSucs(); - cells.erase(cc); - cc->GetCluster()->remove(cc); - outres->insert(cc); - } - else - { - break; - } - } - if (size > 0 && Clus[0]->GetRho() < minRho) - { - auto cc = Clus[0]; - Clus[0] = nullptr; - size--; - cc->SetActive(false); - cc->SetInactiveTime(time); - cc->GetCluster()->remove(cc); - outres->insert(cc); - } +void SESAME::DPTree::deleteInact(SESAME::OutPtr &outres, double minRho, + double time) { + for (int i = size - 1; i > 0; i--) { + if (Clus[i]->GetRho() < minRho) { + auto cc = Clus[i]; + Clus[i] = nullptr; + size--; + cc->SetActive(false); + cc->SetInactiveTime(time); + std::unordered_set cells = cc->GetDep()->GetSucs(); + cells.erase(cc); + cc->GetCluster()->remove(cc); + outres->insert(cc); + } else { + break; + } + } + if (size > 0 && Clus[0]->GetRho() < minRho) { + auto cc = Clus[0]; + Clus[0] = nullptr; + size--; + cc->SetActive(false); + cc->SetInactiveTime(time); + cc->GetCluster()->remove(cc); + outres->insert(cc); + } } -double SESAME::DPTree::computeAlpha(double minDelta) -{ - std::vector deltas(size); - for (int i = 0; i < size; i++) - { - deltas[i] = Clus[i]->GetDelta(); - } - sort(deltas.begin(), deltas.end()); - double delta1; - double delta2; - int m = 0; - int n = 0; - double avg; - double up = 0; - double down = 0; - int i = 0; - for (i = 0; i < size - 1 && deltas[i] < minDelta; i++) - { - n++; - down += deltas[i]; - } - delta1 = deltas[i - 1]; - delta2 = deltas[i]; - for (; i < size; i++) - { - m++; - up += deltas[i]; - } - avg = up + down; - avg = avg / (m + n); - up = up / m; - down = down / n; - double alpha = (up * (down - delta1) * (m * up 
+ delta1)) / - (avg * avg * (delta1 - up) * (n - 1) + (down - delta1) * up * (m * up + delta1)); - double alpha2 = - ((delta2 - down) * up * (m * up - delta2)) / - ((delta2 - down) * up * (m * up - delta2) + avg * avg * (up - delta2) * (n + 1)); - if (alpha < alpha2) - { - return (alpha + alpha2) / 2; - } - else - { - return 0; - } +double SESAME::DPTree::computeAlpha(double minDelta) { + std::vector deltas(size); + for (int i = 0; i < size; i++) { + deltas[i] = Clus[i]->GetDelta(); + } + sort(deltas.begin(), deltas.end()); + double delta1; + double delta2; + int m = 0; + int n = 0; + double avg; + double up = 0; + double down = 0; + int i = 0; + for (i = 0; i < size - 1 && deltas[i] < minDelta; i++) { + n++; + down += deltas[i]; + } + delta1 = deltas[i - 1]; + delta2 = deltas[i]; + for (; i < size; i++) { + m++; + up += deltas[i]; + } + avg = up + down; + avg = avg / (m + n); + up = up / m; + down = down / n; + double alpha = (up * (down - delta1) * (m * up + delta1)) / + (avg * avg * (delta1 - up) * (n - 1) + + (down - delta1) * up * (m * up + delta1)); + double alpha2 = ((delta2 - down) * up * (m * up - delta2)) / + ((delta2 - down) * up * (m * up - delta2) + + avg * avg * (up - delta2) * (n + 1)); + if (alpha < alpha2) { + return (alpha + alpha2) / 2; + } else { + return 0; + } } -double SESAME::DPTree::adjustMinDelta(double alpha) -{ - if (size < 2) - { - return 0; - } - std::vector deltas(size); - for (int i = 0; i < size; i++) - { - deltas[i] = Clus[i]->GetDelta(); - } - sort(deltas.begin(), deltas.end()); - int m = 0; - int n = 0; - double avg = 0; - double up = 0; - double down = 0; - up = deltas[size - 1]; - for (int i = 0; i < size - 1; i++) - { - down += deltas[i]; - } - n = size - 1; +double SESAME::DPTree::adjustMinDelta(double alpha) { + if (size < 2) { + return 0; + } + std::vector deltas(size); + for (int i = 0; i < size; i++) { + deltas[i] = Clus[i]->GetDelta(); + } + sort(deltas.begin(), deltas.end()); + int m = 0; + int n = 0; + double avg = 0; 
+ double up = 0; + double down = 0; + up = deltas[size - 1]; + for (int i = 0; i < size - 1; i++) { + down += deltas[i]; + } + n = size - 1; - m = 1; - avg = (up + down) / (m + n); - double score = fun(alpha, up / m, down / n, avg); - int index = size - 2; + m = 1; + avg = (up + down) / (m + n); + double score = fun(alpha, up / m, down / n, avg); + int index = size - 2; + up += deltas[index]; + m++; + down -= deltas[index]; + n--; + double scoredown = fun(alpha, up / m, down / n, avg); + while (score > scoredown && index > 0) { + score = scoredown; + index--; up += deltas[index]; m++; down -= deltas[index]; n--; - double scoredown = fun(alpha, up / m, down / n, avg); - while (score > scoredown && index > 0) - { - score = scoredown; - index--; - up += deltas[index]; - m++; - down -= deltas[index]; - n--; - scoredown = fun(alpha, up / m, down / n, avg); - } - return (deltas[index + 1] + deltas[index]) / 2; + scoredown = fun(alpha, up / m, down / n, avg); + } + return (deltas[index + 1] + deltas[index]) / 2; } -double SESAME::DPTree::fun(double alpha, double upavg, double downavg, double avg) -{ - return alpha * (avg / upavg) + (1 - alpha) * (downavg / avg); +double SESAME::DPTree::fun(double alpha, double upavg, double downavg, + double avg) { + return alpha * (avg / upavg) + (1 - alpha) * (downavg / avg); } -void SESAME::DPTree::adjustCluster(std::unordered_set &clusters) -{ - std::vector set; // +void SESAME::DPTree::adjustCluster( + std::unordered_set &clusters) { + std::vector set; // - if (Clus[0] == nullptr) - { - // SESAME_DEBUG("there is no cluser-cell, r is very small"); - // SESAME_DEBUG("please adjust your r parameter larger"); - } - else if (Clus[0]->GetCluster() == nullptr) - { // there is new cluster center appearing + if (Clus[0] == nullptr) { + // SESAME_DEBUG("there is no cluser-cell, r is very small"); + // SESAME_DEBUG("please adjust your r parameter larger"); + } else if (Clus[0]->GetCluster() == + nullptr) { // there is new cluster center 
appearing + auto cluster = std::make_shared(cluLabel++); + clusters.insert(cluster); + cluster->add(Clus[0]); + Clus[0]->SetCluster(cluster); + set.push_back(cluster); + } else { + set.push_back(Clus[0]->GetCluster()); + } + + for (int i = 1; i < size; i++) { + if (Clus[i]->GetDelta() >= minDelta) { // clusters split or new cluster + // appears + if (Clus[i]->GetDep() == nullptr) { + SESAME_DEBUG("error contains null dep"); + } + if (Clus[i]->GetCluster() == + Clus[i]->GetDep()->GetCluster()) { // clusters split + auto c1 = Clus[i]->GetCluster(); + auto c2 = std::make_shared(cluLabel++); + if (c1 != nullptr) { + c1->remove(Clus[i]); + } + c2->add(Clus[i]); + Clus[i]->SetCluster(c2); + clusters.insert(c2); + set.push_back(c2); + } + if (Clus[i]->GetCluster() == nullptr) { // new cluster appears auto cluster = std::make_shared(cluLabel++); clusters.insert(cluster); - cluster->add(Clus[0]); - Clus[0]->SetCluster(cluster); + cluster->add(Clus[i]); + Clus[i]->SetCluster(cluster); set.push_back(cluster); - } - else - { - set.push_back(Clus[0]->GetCluster()); - } - - for (int i = 1; i < size; i++) - { - if (Clus[i]->GetDelta() >= minDelta) - { // clusters split or new cluster - // appears - if (Clus[i]->GetDep() == nullptr) - { - SESAME_DEBUG("error contains null dep"); - } - if (Clus[i]->GetCluster() == Clus[i]->GetDep()->GetCluster()) - { // clusters split - auto c1 = Clus[i]->GetCluster(); - auto c2 = std::make_shared(cluLabel++); - if (c1 != nullptr) - { - c1->remove(Clus[i]); - } - c2->add(Clus[i]); - Clus[i]->SetCluster(c2); - clusters.insert(c2); - set.push_back(c2); - } - if (Clus[i]->GetCluster() == nullptr) - { // new cluster appears - auto cluster = std::make_shared(cluLabel++); - clusters.insert(cluster); - cluster->add(Clus[i]); - Clus[i]->SetCluster(cluster); - set.push_back(cluster); - } - else - { - bool flag = false; - for (auto c : set) - { - if (c->GetLabel() == Clus[i]->GetCluster()->GetLabel()) flag = true; - } - if (flag) - { // this is because when 
- // minDelta become smaller - Clus[i]->GetCluster()->remove(Clus[i]); - auto cluster = std::make_shared(cluLabel++); - clusters.insert(cluster); - cluster->add(Clus[i]); - Clus[i]->SetCluster(cluster); - set.push_back(cluster); - } - else - { - set.push_back(Clus[i]->GetCluster()); - } - } + } else { + bool flag = false; + for (auto c : set) { + if (c->GetLabel() == Clus[i]->GetCluster()->GetLabel()) + flag = true; } - else - { - if (Clus[i]->GetDep() == nullptr) - { - SESAME_DEBUG("error contains null dep"); - } - if (Clus[i]->GetCluster() != Clus[i]->GetDep()->GetCluster()) - { - auto c1 = Clus[i]->GetCluster(); - auto c2 = Clus[i]->GetDep()->GetCluster(); - if (c1 != nullptr) - { - c1->remove(Clus[i]); - } - c2->add(Clus[i]); - Clus[i]->SetCluster(c2); - } + if (flag) { // this is because when + // minDelta become smaller + Clus[i]->GetCluster()->remove(Clus[i]); + auto cluster = std::make_shared(cluLabel++); + clusters.insert(cluster); + cluster->add(Clus[i]); + Clus[i]->SetCluster(cluster); + set.push_back(cluster); + } else { + set.push_back(Clus[i]->GetCluster()); + } + } + } else { + if (Clus[i]->GetDep() == nullptr) { + SESAME_DEBUG("error contains null dep"); + } + if (Clus[i]->GetCluster() != Clus[i]->GetDep()->GetCluster()) { + auto c1 = Clus[i]->GetCluster(); + auto c2 = Clus[i]->GetDep()->GetCluster(); + if (c1 != nullptr) { + c1->remove(Clus[i]); } + c2->add(Clus[i]); + Clus[i]->SetCluster(c2); + } } + } } double SESAME::DPTree::GetLastTime() { return lastTime; } void SESAME::DPTree::SetLastTime(double last_time) { lastTime = last_time; } @@ -662,7 +543,9 @@ void SESAME::DPTree::SetSize(int size) { DPTree::size = size; } int SESAME::DPTree::GetNum() { return num; } void SESAME::DPTree::SetNum(int num) { DPTree::num = num; } std::vector &SESAME::DPTree::GetClus() { return Clus; } -void SESAME::DPTree::SetClus(std::vector &clus) { Clus = clus; } +void SESAME::DPTree::SetClus(std::vector &clus) { + Clus = clus; +} double SESAME::DPTree::GetA() { return 
a; } void SESAME::DPTree::SetA(double a) { DPTree::a = a; } double SESAME::DPTree::GetLamd() { return lamd; } diff --git a/src/Algorithm/DataStructure/DataStructureFactory.cpp b/src/Algorithm/DataStructure/DataStructureFactory.cpp index 4f5013c0..78ac2ee3 100644 --- a/src/Algorithm/DataStructure/DataStructureFactory.cpp +++ b/src/Algorithm/DataStructure/DataStructureFactory.cpp @@ -7,108 +7,87 @@ #include -SESAME::TreeNodePtr SESAME::DataStructureFactory::createTreeNode() -{ - return std::make_shared(); +SESAME::TreeNodePtr SESAME::DataStructureFactory::createTreeNode() { + return std::make_shared(); } -void SESAME::DataStructureFactory::clearTreeNode(SESAME::TreeNodePtr treeNode) { treeNode.reset(); } - -SESAME::PointPtr SESAME::DataStructureFactory::createPoint(int dim) -{ - return std::make_shared(dim); +void SESAME::DataStructureFactory::clearTreeNode(SESAME::TreeNodePtr treeNode) { + treeNode.reset(); } -SESAME::PointPtr SESAME::DataStructureFactory::createPoint(int index, double weight, int dim, - double cost) -{ - return std::make_shared(dim, index, weight, cost); +SESAME::MicroClusterPtr +SESAME::DataStructureFactory::createMicroCluster(int dim, int id) { + return std::make_shared(dim, id); } - -SESAME::PointPtr SESAME::DataStructureFactory::createPoint(int index, double weight, int dim, - double cost, int timestamp) -{ - return std::make_shared(dim, index, weight, cost, timestamp); +SESAME::MicroClusterPtr SESAME::DataStructureFactory::createMicroCluster( + int dim, int id, PointPtr dataPoint, double radius) { + return std::make_shared(dim, id, dataPoint, radius); } - -void SESAME::DataStructureFactory::clearPoint(SESAME::PointPtr point) { point.reset(); } - -SESAME::MicroClusterPtr SESAME::DataStructureFactory::createMicroCluster(int dim, int id) -{ - return std::make_shared(dim, id); -} -SESAME::MicroClusterPtr SESAME::DataStructureFactory::createMicroCluster(int dim, int id, - PointPtr dataPoint, - double radius) -{ - return std::make_shared(dim, id, 
dataPoint, radius); -} -void SESAME::DataStructureFactory::clearMicroCluster(SESAME::MicroClusterPtr microCluster) -{ - microCluster.reset(); +void SESAME::DataStructureFactory::clearMicroCluster( + SESAME::MicroClusterPtr microCluster) { + microCluster.reset(); } SESAME::SnapshotPtr SESAME::DataStructureFactory::createSnapshot( - SESAME::MicroClusters &otherMicroClusters, int elapsedTime) -{ - return std::make_shared(otherMicroClusters, elapsedTime); + SESAME::MicroClusters &otherMicroClusters, int elapsedTime) { + return std::make_shared(otherMicroClusters, elapsedTime); } -void SESAME::DataStructureFactory::clearSnapshot(SESAME::SnapshotPtr Snapshot) { Snapshot.reset(); } -SESAME::CFTreePtr SESAME::DataStructureFactory::createCFTree() -{ - return std::make_shared(0, 0, 0); +void SESAME::DataStructureFactory::clearSnapshot(SESAME::SnapshotPtr Snapshot) { + Snapshot.reset(); +} +SESAME::CFTreePtr SESAME::DataStructureFactory::createCFTree() { + return std::make_shared(0, 0, 0); } -SESAME::NodePtr SESAME::DataStructureFactory::createNode() -{ - return std::make_shared(); +SESAME::NodePtr SESAME::DataStructureFactory::createNode() { + return std::make_shared(); } -SESAME::DPNodePtr SESAME::DataStructureFactory::createDPNode() -{ - return std::make_shared(); +SESAME::DPNodePtr SESAME::DataStructureFactory::createDPNode() { + return std::make_shared(); } -SESAME::DPNodePtr SESAME::DataStructureFactory::createDPNode(SESAME::PointPtr p, double time) -{ - return std::make_shared(p, time); +SESAME::DPNodePtr SESAME::DataStructureFactory::createDPNode(SESAME::PointPtr p, + double time) { + return std::make_shared(p, time); } -SESAME::OutPtr SESAME::DataStructureFactory::createOutlierReservoir() -{ - return std::make_shared(); +SESAME::OutPtr SESAME::DataStructureFactory::createOutlierReservoir() { + return std::make_shared(); } -SESAME::CachePtr SESAME::DataStructureFactory::creatCache() -{ - return std::make_shared(); +SESAME::CachePtr 
SESAME::DataStructureFactory::creatCache() { + return std::make_shared(); } -SESAME::OutPtr SESAME::DataStructureFactory::createOutlierReservoir(double r, double a, double lamd) -{ - return std::make_shared(r, a, lamd); +SESAME::OutPtr +SESAME::DataStructureFactory::createOutlierReservoir(double r, double a, + double lamd) { + return std::make_shared(r, a, lamd); } -SESAME::CachePtr SESAME::DataStructureFactory::creatCache(int num, double a, double lamd, double r) -{ - return std::make_shared(num, a, lamd, r); +SESAME::CachePtr SESAME::DataStructureFactory::creatCache(int num, double a, + double lamd, + double r) { + return std::make_shared(num, a, lamd, r); } -SESAME::DPTreePtr SESAME::DataStructureFactory::createDPTree(int num, double r) -{ - return std::make_shared(num, r); +SESAME::DPTreePtr SESAME::DataStructureFactory::createDPTree(int num, + double r) { + return std::make_shared(num, r); } -SESAME::MicroClusterPairPtr SESAME::DataStructureFactory::createMicroClusterPair( - MicroClusterPtr microCluster1, MicroClusterPtr microCluster2) -{ - return std::make_shared(microCluster1, microCluster2); +SESAME::MicroClusterPairPtr +SESAME::DataStructureFactory::createMicroClusterPair( + MicroClusterPtr microCluster1, MicroClusterPtr microCluster2) { + return std::make_shared(microCluster1, + microCluster2); } -void SESAME::DataStructureFactory::clearMicroClusterPair(MicroClusterPairPtr microClusterPair) -{ - microClusterPair.reset(); +void SESAME::DataStructureFactory::clearMicroClusterPair( + MicroClusterPairPtr microClusterPair) { + microClusterPair.reset(); } -SESAME::AdjustedWeightPtr SESAME::DataStructureFactory::createAdjustedWeight(double weight, - int pointTime, - timespec pointTime0) -{ - return std::make_shared(weight, pointTime, pointTime0); -} -void SESAME::DataStructureFactory::clearAdjustedWeight(SESAME::AdjustedWeightPtr adjustedWeight) -{ - adjustedWeight.reset(); +SESAME::AdjustedWeightPtr +SESAME::DataStructureFactory::createAdjustedWeight(double 
weight, int pointTime, + timespec pointTime0) { + return std::make_shared(weight, pointTime, + pointTime0); +} +void SESAME::DataStructureFactory::clearAdjustedWeight( + SESAME::AdjustedWeightPtr adjustedWeight) { + adjustedWeight.reset(); } diff --git a/src/Algorithm/DataStructure/DensityGrid.cpp b/src/Algorithm/DataStructure/DensityGrid.cpp index 064bc3c2..1a957b06 100644 --- a/src/Algorithm/DataStructure/DensityGrid.cpp +++ b/src/Algorithm/DataStructure/DensityGrid.cpp @@ -5,68 +5,65 @@ #include SESAME::DensityGrid::DensityGrid() {} -SESAME::DensityGrid::DensityGrid(const std::vector& coordin) - : dims(coordin.size()), coordinates(coordin), isVisited(false) -{} +SESAME::DensityGrid::DensityGrid(const std::vector &coordin) + : dims(coordin.size()), coordinates(coordin), isVisited(false) {} -SESAME::DensityGrid::DensityGrid(DensityGrid const& grid) - : dims(grid.dims), coordinates(grid.coordinates), isVisited(false) -{} +SESAME::DensityGrid::DensityGrid(DensityGrid const &grid) + : dims(grid.dims), coordinates(grid.coordinates), isVisited(false) {} /** - * Generates a vector of neighbours for this density grid by varying each coordinate - * by one in either direction. Does not test whether the generated neighbours are valid as - * DensityGrid is not aware of the number of partitions in each dim. + * Generates a vector of neighbours for this density grid by varying each + * coordinate by one in either direction. Does not test whether the generated + * neighbours are valid as DensityGrid is not aware of the number of partitions + * in each dim. 
* * @return a vector of neighbours for this density grid */ -std::vector SESAME::DensityGrid::getNeighbours() const -{ - // SESAME_INFO("Obtain neighbours"); - std::vector neighbours; - std::vector hCoord = this->coordinates; - for (int i = 0; i < this->dims; i++) - { - hCoord[i] = hCoord[i] - 1; - DensityGrid grid(hCoord); - neighbours.push_back(grid); +std::vector SESAME::DensityGrid::getNeighbours() const { + // SESAME_INFO("Obtain neighbours"); + std::vector neighbours; + std::vector hCoord = this->coordinates; + for (int i = 0; i < this->dims; i++) { + hCoord[i] = hCoord[i] - 1; + DensityGrid grid(hCoord); + neighbours.push_back(grid); - hCoord[i] = hCoord[i] + 2; - DensityGrid grid2(hCoord); - neighbours.push_back(grid2); + hCoord[i] = hCoord[i] + 2; + DensityGrid grid2(hCoord); + neighbours.push_back(grid2); - hCoord[i] = hCoord[i] - 1; - } + hCoord[i] = hCoord[i] - 1; + } - return neighbours; + return neighbours; } /** - * Provides the probability of the argument instance belonging to the density grid in question. + * Provides the probability of the argument instance belonging to the density + * grid in question. * - * @return 1.0 if the instance equals the density grid's coordinates; 0.0 otherwise. + * @return 1.0 if the instance equals the density grid's coordinates; 0.0 + * otherwise. 
*/ -double SESAME::DensityGrid::getInclusionProbability(Point point) -{ - for (int i = 0; i < this->dims; i++) - { - if ((int)point.getFeatureItem(i) != this->coordinates[i]) return 0.0; - } - return 1.0; +double SESAME::DensityGrid::getInclusionProbability(Point point) { + for (int i = 0; i < this->dims; i++) { + if ((int)point.getFeatureItem(i) != this->coordinates[i]) + return 0.0; + } + return 1.0; } -bool SESAME::DensityGrid::operator==(DensityGrid& gridOther) const -{ - if (this == &gridOther) - { - return true; - } - if (this->dims != gridOther.dims) return false; - for (int i = 0; i < this->dims; i++) - { - if (this->coordinates[i] != gridOther.coordinates[i]) return false; - } +bool SESAME::DensityGrid::operator==(DensityGrid &gridOther) const { + if (this == &gridOther) { return true; + } + if (this->dims != gridOther.dims) + return false; + for (int i = 0; i < this->dims; i++) { + if (this->coordinates[i] != gridOther.coordinates[i]) + return false; + } + return true; } \ No newline at end of file diff --git a/src/Algorithm/DataStructure/FeatureVector.cpp b/src/Algorithm/DataStructure/FeatureVector.cpp index 73ba0408..977d1906 100644 --- a/src/Algorithm/DataStructure/FeatureVector.cpp +++ b/src/Algorithm/DataStructure/FeatureVector.cpp @@ -14,60 +14,46 @@ std::vector SESAME::CF::getLS() const { return this->LS; } std::vector SESAME::CF::getSS() const { return this->SS; } -double SESAME::CF::getLSItem(int index) const { return this->getLS().at(index); } +double SESAME::CF::getLSItem(int index) const { + return this->getLS().at(index); +} -double SESAME::CF::getSSItem(int index) const { return this->getSS().at(index); } +double SESAME::CF::getSSItem(int index) const { + return this->getSS().at(index); +} -void SESAME::CF::setLS(std::vector& newLs) -{ - if (this->getLS().empty()) - { - for (double newL : newLs) - { - this->LS.push_back(newL); - } +void SESAME::CF::setLS(std::vector &newLs) { + if (this->getLS().empty()) { + for (double newL : newLs) { + 
this->LS.push_back(newL); } - else - { - if (this->getLS().size() != newLs.size()) - { - std::cout << "Size Error: CF's LS size: " << this->getLS().size() - << ", newLS's size: " << newLs.size(); - } - else - { - for (int i = 0; i < newLs.size(); i++) - { - this->LS[i] = newLs[i]; - } - } + } else { + if (this->getLS().size() != newLs.size()) { + std::cout << "Size Error: CF's LS size: " << this->getLS().size() + << ", newLS's size: " << newLs.size(); + } else { + for (int i = 0; i < newLs.size(); i++) { + this->LS[i] = newLs[i]; + } } + } } -void SESAME::CF::setSS(std::vector& newSs) -{ - if (this->getSS().empty()) - { - for (double newS : newSs) - { - this->SS.push_back(newS); - } +void SESAME::CF::setSS(std::vector &newSs) { + if (this->getSS().empty()) { + for (double newS : newSs) { + this->SS.push_back(newS); } - else - { - if (this->SS.size() != newSs.size()) - { - std::cout << "Size Error: CF's SS size: " << this->getSS().size() - << ", newSs's size: " << newSs.size(); - } - else - { - for (int i = 0; i < newSs.size(); i++) - { - this->SS[i] = newSs[i]; - } - } + } else { + if (this->SS.size() != newSs.size()) { + std::cout << "Size Error: CF's SS size: " << this->getSS().size() + << ", newSs's size: " << newSs.size(); + } else { + for (int i = 0; i < newSs.size(); i++) { + this->SS[i] = newSs[i]; + } } + } } SESAME::CFPtr SESAME::CF::copy() { return std::make_shared(*this); } int SESAME::CF::getIndex() { return this->index; } diff --git a/src/Algorithm/DataStructure/GridCluster.cpp b/src/Algorithm/DataStructure/GridCluster.cpp index 39a221da..f619b15f 100644 --- a/src/Algorithm/DataStructure/GridCluster.cpp +++ b/src/Algorithm/DataStructure/GridCluster.cpp @@ -8,78 +8,73 @@ SESAME::GridCluster::GridCluster() {} SESAME::GridCluster::GridCluster(int label) { this->clusterLabel = label; } // TODO: if Using this function, be careful when grids are not NULL -SESAME::GridCluster::GridCluster(HashGrids hashMap, int label) -{ - HashGrids::iterator iterW; - for 
(iterW = hashMap.begin(); iterW != hashMap.end(); iterW++) - { - DensityGrid grid = iterW->first; - bool inside = iterW->second; - // this->grids.insert(std::make_pair(grid, inside)); - putHashGrid(this->grids, grid, inside); - } - this->clusterLabel = label; +SESAME::GridCluster::GridCluster(HashGrids hashMap, int label) { + HashGrids::iterator iterW; + for (iterW = hashMap.begin(); iterW != hashMap.end(); iterW++) { + DensityGrid grid = iterW->first; + bool inside = iterW->second; + // this->grids.insert(std::make_pair(grid, inside)); + putHashGrid(this->grids, grid, inside); + } + this->clusterLabel = label; } /** * @param grid the density grid to add to the cluster */ -void SESAME::GridCluster::addGrid(const DensityGrid& grid) -{ - bool inside = isInside(grid); - auto it1 = grids.find(grid); - if (it1 != grids.end()) - grids[grid] = inside; - else - grids.insert(std::make_pair(grid, inside)); - // Iterate on grids and judge whether they are inside grids or not - for (auto& grid_iter : this->grids) - { - bool inside2U = grid_iter.second; - if (!inside2U) - { - DensityGrid dg2U = grid_iter.first; - grid_iter.second = isInside(dg2U); - } +void SESAME::GridCluster::addGrid(const DensityGrid &grid) { + bool inside = isInside(grid); + auto it1 = grids.find(grid); + if (it1 != grids.end()) + grids[grid] = inside; + else + grids.insert(std::make_pair(grid, inside)); + // Iterate on grids and judge whether they are inside grids or not + for (auto &grid_iter : this->grids) { + bool inside2U = grid_iter.second; + if (!inside2U) { + DensityGrid dg2U = grid_iter.first; + grid_iter.second = isInside(dg2U); } + } } /** * @param dg the density grid to remove from the cluster */ -void SESAME::GridCluster::removeGrid(const DensityGrid& grid) { this->grids.erase(grid); } +void SESAME::GridCluster::removeGrid(const DensityGrid &grid) { + this->grids.erase(grid); +} /** * @param gridClus the GridCluster to be absorbed into this cluster */ -void 
SESAME::GridCluster::absorbCluster(GridCluster gridCluster) -{ - bool inside; - SESAME::HashGrids newCluster; - // SESAME_INFO("Absorb cluster "<< gridCluster.clusterLabel <<" into cluster - // "<clusterLabel<<"."); - - // Add each density grid from gridCluster into this->grids - auto grid = gridCluster.grids.begin(); - while (grid != gridCluster.grids.end()) - { - // TODO whether they have same grids? - // this->grids.insert(std::make_pair(grid->first, false)); - putHashGrid(this->grids, grid->first, false); - grid++; - } - // SESAME_INFO("...density grids added"); - // Determine which density grids in this.grids are 'inside' and which are 'outside' - auto thisGrid = this->grids.begin(); // mod - while (thisGrid != this->grids.end()) - { - inside = isInside(thisGrid->first); - putHashGrid(newCluster, thisGrid->first, inside); - thisGrid++; - } - this->grids = newCluster; - // SESAME_INFO("...inside/outside determined"); +void SESAME::GridCluster::absorbCluster(GridCluster gridCluster) { + bool inside; + SESAME::HashGrids newCluster; + // SESAME_INFO("Absorb cluster "<< gridCluster.clusterLabel <<" into cluster + // "<clusterLabel<<"."); + + // Add each density grid from gridCluster into this->grids + auto grid = gridCluster.grids.begin(); + while (grid != gridCluster.grids.end()) { + // TODO whether they have same grids? 
+ // this->grids.insert(std::make_pair(grid->first, false)); + putHashGrid(this->grids, grid->first, false); + grid++; + } + // SESAME_INFO("...density grids added"); + // Determine which density grids in this.grids are 'inside' and which are + // 'outside' + auto thisGrid = this->grids.begin(); // mod + while (thisGrid != this->grids.end()) { + inside = isInside(thisGrid->first); + putHashGrid(newCluster, thisGrid->first, inside); + thisGrid++; + } + this->grids = newCluster; + // SESAME_INFO("...inside/outside determined"); } /** @@ -91,17 +86,14 @@ void SESAME::GridCluster::absorbCluster(GridCluster gridCluster) * @param grid the density grid to label as being inside or out * @return TRUE if g is an inside grid, FALSE otherwise */ -bool SESAME::GridCluster::isInside(DensityGrid grid) -{ - std::vector neighbour = grid.getNeighbours(); - for (auto gridNeighbourhood : neighbour) - { - if (this->grids.find(gridNeighbourhood) == this->grids.end()) - { - return false; - } +bool SESAME::GridCluster::isInside(DensityGrid grid) { + std::vector neighbour = grid.getNeighbours(); + for (auto gridNeighbourhood : neighbour) { + if (this->grids.find(gridNeighbourhood) == this->grids.end()) { + return false; } - return true; + } + return true; } /** @@ -114,116 +106,105 @@ bool SESAME::GridCluster::isInside(DensityGrid grid) * @param other the density grid being proposed for addition * @return TRUE if g would be an inside grid, FALSE otherwise */ -bool SESAME::GridCluster::isInside(DensityGrid grid, DensityGrid other) -{ - std::vector neighbour = grid.getNeighbours(); - for (auto gridNeighbourhood : neighbour) - { - if (this->grids.find(gridNeighbourhood) != this->grids.end() && gridNeighbourhood == other) - { - return false; - } +bool SESAME::GridCluster::isInside(DensityGrid grid, DensityGrid other) { + std::vector neighbour = grid.getNeighbours(); + for (auto gridNeighbourhood : neighbour) { + if (this->grids.find(gridNeighbourhood) != this->grids.end() && + 
gridNeighbourhood == other) { + return false; } - return true; + } + return true; } /** - * Tests a grid cluster for connectedness according to Definition 3.4, Grid Group, from - * Chen and Tu 2007. + * Tests a grid cluster for connectedness according to Definition 3.4, Grid + * Group, from Chen and Tu 2007. * - * Selects one density grid in the grid cluster as a starting point and iterates repeatedly - * through its neighbours until no more density grids in the grid cluster can be visited. + * Selects one density grid in the grid cluster as a starting point and iterates + * repeatedly through its neighbours until no more density grids in the grid + * cluster can be visited. * * @return TRUE if the cluster represent one single grid group; FALSE otherwise. */ -bool SESAME::GridCluster::isConnected() -{ - // TODO A little confused about here - - if (!this->grids.empty()) - { - DensityGrid grid = this->grids.begin()->first; - putHashGrid(this->visited, grid, this->grids.begin()->second); - bool changesMade; - - do - { - changesMade = false; - auto visIter = this->visited.begin(); - HashGrids toAdd; - - while (visIter != this->visited.end() && toAdd.empty()) - { - DensityGrid dg2V = visIter->first; - std::vector neighbour = dg2V.getNeighbours(); - for (auto dg2VNeighbourhood : neighbour) - { - if (this->grids.find(dg2VNeighbourhood) != this->grids.end() && - this->visited.find(dg2VNeighbourhood) == this->visited.end()) - putHashGrid(toAdd, dg2VNeighbourhood, - this->grids.find(dg2VNeighbourhood)->second); - } - visIter++; - } - - if (!toAdd.empty()) - { - HashGrids::iterator gridToAdd; - for (gridToAdd = toAdd.begin(); gridToAdd != toAdd.end(); gridToAdd++) - { - putHashGrid(this->visited, gridToAdd->first, gridToAdd->second); - } - changesMade = true; - } - } while (changesMade); - } +bool SESAME::GridCluster::isConnected() { + // TODO A little confused about here + + if (!this->grids.empty()) { + DensityGrid grid = this->grids.begin()->first; + 
putHashGrid(this->visited, grid, this->grids.begin()->second); + bool changesMade; + + do { + changesMade = false; + auto visIter = this->visited.begin(); + HashGrids toAdd; + + while (visIter != this->visited.end() && toAdd.empty()) { + DensityGrid dg2V = visIter->first; + std::vector neighbour = dg2V.getNeighbours(); + for (auto dg2VNeighbourhood : neighbour) { + if (this->grids.find(dg2VNeighbourhood) != this->grids.end() && + this->visited.find(dg2VNeighbourhood) == this->visited.end()) + putHashGrid(toAdd, dg2VNeighbourhood, + this->grids.find(dg2VNeighbourhood)->second); + } + visIter++; + } - if (this->visited.size() == this->grids.size()) - { - // SESAME_INFO("The cluster is still connected. "<visited.size()+" of - // "<grids.size()<<" reached."); - return true; - } - else - { - // SESAME_INFO("The cluster is no longer connected. "<visited, gridToAdd->first, gridToAdd->second); + } + changesMade = true; + } + } while (changesMade); + } + + if (this->visited.size() == this->grids.size()) { + // SESAME_INFO("The cluster is still connected. "<visited.size()+" of + // "<grids.size()<<" reached."); + return true; + } else { + // SESAME_INFO("The cluster is no longer connected. 
+ // "<grids.begin(); iterW != this->grids.end(); iterW++) - { - DensityGrid grid = iterW->first; - if (grid.getInclusionProbability(point) == 1.0) return 1.0; - } - return 0.0; +double SESAME::GridCluster::getInclusionProb(Point point) { + HashGrids::iterator iterW; + // Iterate on grids and judge whether they are inside grids or not + for (iterW = this->grids.begin(); iterW != this->grids.end(); iterW++) { + DensityGrid grid = iterW->first; + if (grid.getInclusionProbability(point) == 1.0) + return 1.0; + } + return 0.0; } -bool SESAME::GridCluster::operator==(GridCluster& other) const -{ - bool equal = false; - if (clusterLabel == other.clusterLabel && grids.size() == other.grids.size() && - visited.size() == other.visited.size()) - equal = true; - return equal; +bool SESAME::GridCluster::operator==(GridCluster &other) const { + bool equal = false; + if (clusterLabel == other.clusterLabel && + grids.size() == other.grids.size() && + visited.size() == other.visited.size()) + equal = true; + return equal; } -void SESAME::GridCluster::putHashGrid(HashGrids grids1, const DensityGrid& g, bool inside) -{ - auto it1 = grids1.find(g); - if (it1 != grids1.end()) - it1->second = inside; - else - grids1.insert(std::make_pair(g, inside)); +void SESAME::GridCluster::putHashGrid(HashGrids grids1, const DensityGrid &g, + bool inside) { + auto it1 = grids1.find(g); + if (it1 != grids1.end()) + it1->second = inside; + else + grids1.insert(std::make_pair(g, inside)); } diff --git a/src/Algorithm/DataStructure/MeyersonSketch.cpp b/src/Algorithm/DataStructure/MeyersonSketch.cpp index d1c184a5..82e593f6 100644 --- a/src/Algorithm/DataStructure/MeyersonSketch.cpp +++ b/src/Algorithm/DataStructure/MeyersonSketch.cpp @@ -6,91 +6,78 @@ using namespace std; using namespace SESAME; -MeyersonSketch::MeyersonSketch(const param_t ¶m) : param(param), r(param.seed) -{ - max_sketch_size_ = - pow(2, 2 * 2 + 1) * 4. * param.k * (1. + log(param.sliding * 3)) * (1.0 + 1. 
/ 0.5); +MeyersonSketch::MeyersonSketch(const param_t ¶m) + : param(param), r(param.seed) { + max_sketch_size_ = pow(2, 2 * 2 + 1) * 4. * param.k * + (1. + log(param.sliding * 3)) * (1.0 + 1. / 0.5); } void MeyersonSketch::Init() {} -MeyersonSketch::NodePtr MeyersonSketch::Insert(PointPtr input) -{ - if (!has_sampled) - { - if (samples.size() < param.num_samples * param.sliding) samples.push_back(input); - if (samples.size() >= param.num_samples * param.sliding) - { - const auto &[lower_bound, upper_bound] = guess_optimum_range_bounds( - &r, samples, param.sliding, param.num_samples, param.num_clusters); - distance_denominator_ = (upper_bound) / (param.k * (1. + log(param.sliding * 3))); - NodePtr res; - for (auto p : samples) - { - res = Process(p); - } - has_sampled = true; - return res; - } - return nullptr; - } - else - { - return Process(input); +MeyersonSketch::NodePtr MeyersonSketch::Insert(PointPtr input) { + if (!has_sampled) { + if (samples.size() < param.num_samples * param.sliding) + samples.push_back(input); + if (samples.size() >= param.num_samples * param.sliding) { + const auto &[lower_bound, upper_bound] = guess_optimum_range_bounds( + &r, samples, param.sliding, param.num_samples, param.num_clusters); + distance_denominator_ = + (upper_bound) / (param.k * (1. 
+ log(param.sliding * 3))); + NodePtr res; + for (auto p : samples) { + res = Process(p); + } + has_sampled = true; + return res; } + return nullptr; + } else { + return Process(input); + } } -MeyersonSketch::NodePtr MeyersonSketch::Process(PointPtr point) -{ - if (centers.empty()) - { - return CreateCenter(point); - } - auto [node, dist] = CalcClosestNode(centers, point); - bool open_new = r.bernoulli(min(1.0, pow(dist, 2) / distance_denominator_)); - if (open_new) - { - return CreateCenter(point); - } - else - { - node->Update(point); - return node; - } +MeyersonSketch::NodePtr MeyersonSketch::Process(PointPtr point) { + if (centers.empty()) { + return CreateCenter(point); + } + auto [node, dist] = CalcClosestNode(centers, point); + bool open_new = r.bernoulli(min(1.0, pow(dist, 2) / distance_denominator_)); + if (open_new) { + return CreateCenter(point); + } else { + node->Update(point); + return node; + } } -MeyersonSketch::NodePtr MeyersonSketch::CreateCenter(PointPtr input) -{ - if (centers.size() >= max_sketch_size_) - { - return nullptr; - } - auto node = std::make_shared(input); - centers.push_back(node); - return node; +MeyersonSketch::NodePtr MeyersonSketch::CreateCenter(PointPtr input) { + if (centers.size() >= max_sketch_size_) { + return nullptr; + } + auto node = std::make_shared(input); + centers.push_back(node); + return node; } -MeyersonSketch::NodePtr MeyersonSketch::Insert(NodePtr node) -{ - centers.push_back(node); - return node; +MeyersonSketch::NodePtr MeyersonSketch::Insert(NodePtr node) { + centers.push_back(node); + return node; } -void MeyersonSketch::Remove(NodePtr node) -{ - auto it = std::find(centers.begin(), centers.end(), node); - if (it != centers.end()) - { - centers.erase(it); - } +void MeyersonSketch::Remove(NodePtr node) { + auto it = std::find(centers.begin(), centers.end(), node); + if (it != centers.end()) { + centers.erase(it); + } } -std::vector &MeyersonSketch::clusters() { return centers; } +std::vector 
&MeyersonSketch::clusters() { + return centers; +} -void MeyersonSketch::ForEach(std::function func) -{ - for (auto node : centers) - { - func(node); - } +void MeyersonSketch::ForEach( + std::function func) { + for (auto node : centers) { + func(node); + } } diff --git a/src/Algorithm/DataStructure/MicroCluster.cpp b/src/Algorithm/DataStructure/MicroCluster.cpp index 16a0e6da..fbb1c432 100644 --- a/src/Algorithm/DataStructure/MicroCluster.cpp +++ b/src/Algorithm/DataStructure/MicroCluster.cpp @@ -1,4 +1,5 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) // // Created by 1124a on 2021/8/16. @@ -8,208 +9,189 @@ #include #include // Create MC, only initialization, used for DenStream, CluStream -SESAME::MicroCluster::MicroCluster(int dim, int id) -{ - this->dim = dim; - weight = 0; - this->id.push_back(id); - LST = 0; - SST = 0; - this->visited = false; - this->createTime = 0; - this->lastUpdateTime = this->createTime; - radius = 0; - visited = false; +SESAME::MicroCluster::MicroCluster(int dim, int id) { + this->dim = dim; + weight = 0; + this->id.push_back(id); + LST = 0; + SST = 0; + this->visited = false; + this->createTime = 0; + this->lastUpdateTime = this->createTime; + radius = 0; + visited = false; } -// Create MC, only initialization, only used for DBStream as it has user-defined fixed radius -SESAME::MicroCluster::MicroCluster(int dim, int id, PointPtr dataPoint, double radius) -{ - this->dim = dim; - weight = 1; - this->id.push_back(id); - LST = 0; - SST = 0; - this->visited = false; - this->createTime = 0; - this->lastUpdateTime = this->createTime; - this->radius = radius; - - for (int i = 0; i < this->dim; i++) - { - double data = dataPoint->feature[i]; - LS.push_back(data); - centroid.push_back(data); - } +// Create MC, only initialization, only used for DBStream as it has user-defined +// fixed radius 
+SESAME::MicroCluster::MicroCluster(int dim, int id, PointPtr dataPoint, + double radius) { + this->dim = dim; + weight = 1; + this->id.push_back(id); + LST = 0; + SST = 0; + this->visited = false; + this->createTime = 0; + this->lastUpdateTime = this->createTime; + this->radius = radius; + + for (int i = 0; i < this->dim; i++) { + double data = dataPoint->feature[i]; + LS.push_back(data); + centroid.push_back(data); + } } // Release memory of the current micro cluster -SESAME::MicroCluster::~MicroCluster() -{ - std::vector().swap(id); - std::vector().swap(centroid); - std::vector().swap(LS); - std::vector().swap(SS); +SESAME::MicroCluster::~MicroCluster() { + std::vector().swap(id); + std::vector().swap(centroid); + std::vector().swap(LS); + std::vector().swap(SS); } // Used in DenStream, DBStream -void SESAME::MicroCluster::Init(PointPtr datapoint, int timestamp) -{ - weight++; - - for (int i = 0; i < dim; i++) - { - double data = datapoint->getFeatureItem(i); - LS.push_back(data); - SS.push_back(data * data); - centroid.push_back(data); - } - this->createTime = datapoint->getIndex(); - LST += timestamp; - SST += timestamp * timestamp; +void SESAME::MicroCluster::Init(PointPtr datapoint, int timestamp) { + weight++; + + for (int i = 0; i < dim; i++) { + double data = datapoint->getFeatureItem(i); + LS.push_back(data); + SS.push_back(data * data); + centroid.push_back(data); + } + this->createTime = datapoint->getIndex(); + LST += timestamp; + SST += timestamp * timestamp; } // Used in DenStream, DBStream // insert a new data point from input data stream -void SESAME::MicroCluster::insert(PointPtr datapoint, int timestamp) -{ - weight++; - for (int i = 0; i < LS.size(); i++) - { - double data = datapoint->getFeatureItem(i); - LS[i] += data; - SS[i] += data * data; - } - LST += timestamp; - SST += timestamp * timestamp; - centroid = std::move(getCentroid()); +void SESAME::MicroCluster::insert(PointPtr datapoint, int timestamp) { + weight++; + for (int i = 0; i < 
LS.size(); i++) { + double data = datapoint->getFeatureItem(i); + LS[i] += data; + SS[i] += data * data; + } + LST += timestamp; + SST += timestamp * timestamp; + centroid = std::move(getCentroid()); } // Used only in DBStream -void SESAME::MicroCluster::insert(PointPtr datapoint) //,double decayFactor +void SESAME::MicroCluster::insert(PointPtr datapoint) //,double decayFactor { - weight++; - double val = exp(-(pow(3 * this->distance / radius, 2) / 2)); - for (int i = 0; i < LS.size(); i++) - { - double data = datapoint->getFeatureItem(i); - LS[i] = centroid.at(i) + val * (data - centroid.at(i)); - } - lastUpdateTime = datapoint->getIndex(); + weight++; + double val = exp(-(pow(3 * this->distance / radius, 2) / 2)); + for (int i = 0; i < LS.size(); i++) { + double data = datapoint->getFeatureItem(i); + LS[i] = centroid.at(i) + val * (data - centroid.at(i)); + } + lastUpdateTime = datapoint->getIndex(); } -double SESAME::MicroCluster::getDistance(PointPtr datapoint) -{ - this->distance = calCentroidDistance(datapoint); - return this->distance; +double SESAME::MicroCluster::getDistance(PointPtr datapoint) { + this->distance = calCentroidDistance(datapoint); + return this->distance; } -// Often Used only in DBStream TODO this just a note, need to delete or detailed explain later -double SESAME::MicroCluster::getDistance(MicroClusterPtr other) -{ - double temp = 0, dist = 0; - for (int i = 0; i < this->dim; i++) - { - temp = this->centroid[i] - other->centroid[i]; - dist += temp * temp; - } - return sqrt(dist); +// Often Used only in DBStream TODO this just a note, need to delete or detailed +// explain later +double SESAME::MicroCluster::getDistance(MicroClusterPtr other) { + double temp = 0, dist = 0; + for (int i = 0; i < this->dim; i++) { + temp = this->centroid[i] - other->centroid[i]; + dist += temp * temp; + } + return sqrt(dist); } // Used in DenStream -bool SESAME::MicroCluster::insert(PointPtr datapoint, double decayFactor, double epsilon) -{ - bool result; 
- dataPoint LSPre; - LSPre.assign(this->LS.begin(), this->LS.end()); - dataPoint SSPre; - SSPre.assign(this->SS.begin(), this->SS.end()); - for (int i = 0; i < this->dim; i++) - { - double data = datapoint->getFeatureItem(i); - LSPre[i] *= decayFactor; - LSPre[i] += data; - SSPre[i] *= decayFactor; - SSPre[i] += data * data; - } - bool judge; - if (getRadius(decayFactor, judge) < epsilon) - { - LS = LSPre; - SS = SSPre; - weight *= decayFactor; - weight++; - for (int i = 0; i < this->dim; i++) - { - centroid.at(i) = LS.at(i) / weight; - } - this->lastUpdateTime = datapoint->getIndex(); - result = true; +bool SESAME::MicroCluster::insert(PointPtr datapoint, double decayFactor, + double epsilon) { + bool result; + dataPoint LSPre; + LSPre.assign(this->LS.begin(), this->LS.end()); + dataPoint SSPre; + SSPre.assign(this->SS.begin(), this->SS.end()); + for (int i = 0; i < this->dim; i++) { + double data = datapoint->getFeatureItem(i); + LSPre[i] *= decayFactor; + LSPre[i] += data; + SSPre[i] *= decayFactor; + SSPre[i] += data * data; + } + bool judge; + if (getRadius(decayFactor, judge) < epsilon) { + LS = LSPre; + SS = SSPre; + weight *= decayFactor; + weight++; + for (int i = 0; i < this->dim; i++) { + centroid.at(i) = LS.at(i) / weight; } - else - result = false; - return result; + this->lastUpdateTime = datapoint->getIndex(); + result = true; + } else + result = false; + return result; } // merge two micro-clusters -void SESAME::MicroCluster::merge(MicroClusterPtr other) -{ - weight += other->weight; - for (int i = 0; i < dim; i++) // dim can change to CF1x.size() - { - LS[i] += other->LS[i]; - SS[i] += other->SS[i]; - } - LST += other->LST; - SST += other->SST; - updateId(other); - centroid = std::move(getCentroid()); +void SESAME::MicroCluster::merge(MicroClusterPtr other) { + weight += other->weight; + for (int i = 0; i < dim; i++) // dim can change to CF1x.size() + { + LS[i] += other->LS[i]; + SS[i] += other->SS[i]; + } + LST += other->LST; + SST += other->SST; 
+ updateId(other); + centroid = std::move(getCentroid()); } // Calculate the process of micro cluster N(Tc-h') -void SESAME::MicroCluster::subtractClusterVector(MicroClusterPtr other) -{ - this->weight -= other->weight; - for (int i = 0; i < dim; i++) - { - this->LS[i] -= other->LS[i]; - this->SS[i] -= other->SS[i]; - } - this->LST -= other->LST; - this->SST -= other->SST; - centroid = std::move(getCentroid()); +void SESAME::MicroCluster::subtractClusterVector(MicroClusterPtr other) { + this->weight -= other->weight; + for (int i = 0; i < dim; i++) { + this->LS[i] -= other->LS[i]; + this->SS[i] -= other->SS[i]; + } + this->LST -= other->LST; + this->SST -= other->SST; + centroid = std::move(getCentroid()); } -bool SESAME::MicroCluster::judgeMerge(MicroClusterPtr other) -{ - bool merge = true; - for (unsigned int i = 0; i < other->id.size(); i++) - { - if (std::find(this->id.begin(), this->id.end(), other->id[i]) == id.end()) merge = false; - } - return merge; +bool SESAME::MicroCluster::judgeMerge(MicroClusterPtr other) { + bool merge = true; + for (unsigned int i = 0; i < other->id.size(); i++) { + if (std::find(this->id.begin(), this->id.end(), other->id[i]) == id.end()) + merge = false; + } + return merge; } // update id list of Micro cluster -void SESAME::MicroCluster::updateId(MicroClusterPtr other) -{ - for (unsigned int i = 0; i < other->id.size(); i++) - { - this->id.push_back(other->id[i]); - } - std::vector().swap(other->id); - this->id.reserve(20); +void SESAME::MicroCluster::updateId(MicroClusterPtr other) { + for (unsigned int i = 0; i < other->id.size(); i++) { + this->id.push_back(other->id[i]); + } + std::vector().swap(other->id); + this->id.reserve(20); } // reset the unique id in MICRO CLUSTER OF DENSTREAM -void SESAME::MicroCluster::resetID(int index) -{ - this->id.pop_back(); - this->id.push_back(index); +void SESAME::MicroCluster::resetID(int index) { + this->id.pop_back(); + this->id.push_back(index); } // obtain relevance stamp of a cluster 
to judge whether it needs to be deleted -double SESAME::MicroCluster::getRelevanceStamp(int num_last_arr) const -{ - if (weight < (2 * num_last_arr)) return getMutime(); - return getMutime() + getSigmaTime() * getQuantile(((double)num_last_arr) / (2 * weight)); +double SESAME::MicroCluster::getRelevanceStamp(int num_last_arr) const { + if (weight < (2 * num_last_arr)) + return getMutime(); + return getMutime() + + getSigmaTime() * getQuantile(((double)num_last_arr) / (2 * weight)); } // mean_timestamp @@ -218,163 +200,151 @@ double SESAME::MicroCluster::getMutime() const { return LST / weight; } // standard_deviation_timestamp -double SESAME::MicroCluster::getSigmaTime() const -{ - return sqrt(SST / weight - (LST / weight) * (LST / weight)); +double SESAME::MicroCluster::getSigmaTime() const { + return sqrt(SST / weight - (LST / weight) * (LST / weight)); } -double SESAME::MicroCluster::getQuantile(double z) -{ - assert(z >= 0 && z <= 1); - return sqrt(2) * inverseError(2 * z - 1); +double SESAME::MicroCluster::getQuantile(double z) { + assert(z >= 0 && z <= 1); + return sqrt(2) * inverseError(2 * z - 1); } -double SESAME::MicroCluster::getRadius(double radius) -{ - if (weight == 1) return 0; - if (radius <= 0) radius = 1.8; +double SESAME::MicroCluster::getRadius(double radius) { + if (weight == 1) + return 0; + if (radius <= 0) + radius = 1.8; - return getDeviation() * radius; + return getDeviation() * radius; } // Calculate the radius of the current micro cluster in DenStream -double SESAME::MicroCluster::getRadius(double decayFactor, bool judge) -{ - double radius = 0; - for (int i = 0; i < this->dim; i++) - { - radius += (SS.at(i) - (pow(LS.at(i), 2) / (weight * decayFactor + 1))); - } - judge = true; - return sqrt(radius / weight); +double SESAME::MicroCluster::getRadius(double decayFactor, bool judge) { + double radius = 0; + for (int i = 0; i < this->dim; i++) { + radius += (SS.at(i) - (pow(LS.at(i), 2) / (weight * decayFactor + 1))); + } + judge = true; + 
return sqrt(radius / weight); } // calculate RMS deviation very confused -double SESAME::MicroCluster::getDeviation() -{ - SESAME::dataPoint variance = getVarianceVector(); - double sumOfDeviation = 0; - for (int i = 0; i < dim; i++) - { - sumOfDeviation += sqrt(variance[i]); - } +double SESAME::MicroCluster::getDeviation() { + SESAME::dataPoint variance = getVarianceVector(); + double sumOfDeviation = 0; + for (int i = 0; i < dim; i++) { + sumOfDeviation += sqrt(variance[i]); + } - return sumOfDeviation / dim; + return sumOfDeviation / dim; } // calculate centroid of a cluster -SESAME::dataPoint SESAME::MicroCluster::getCentroid() -{ - if (weight == 1) return LS; - dataPoint dataObject(LS.size()); // double - if (weight > 1) - { - for (int i = 0; i < centroid.size(); i++) - { - dataObject[i] = (LS[i] / weight); - } +SESAME::dataPoint SESAME::MicroCluster::getCentroid() { + if (weight == 1) + return LS; + dataPoint dataObject(LS.size()); // double + if (weight > 1) { + for (int i = 0; i < centroid.size(); i++) { + dataObject[i] = (LS[i] / weight); } - return dataObject; + } + return dataObject; } // calculate centroid of a cluster -SESAME::PointPtr SESAME::MicroCluster::getCenter() -{ - PointPtr center = DataStructureFactory::createPoint(dim); - for (int i = 0; i < centroid.size(); i++) center->setFeatureItem(LS[i] / weight, i); - return center; +SESAME::PointPtr SESAME::MicroCluster::getCenter() { + PointPtr center = GenericFactory::New(dim); + for (int i = 0; i < centroid.size(); i++) + center->setFeatureItem(LS[i] / weight, i); + return center; } -double SESAME::MicroCluster::getInclusionProbability(PointPtr datapoint, double radius) -{ - if (weight == 1) - { - double distance = 0; - for (int i = 0; i < dim; i++) - { - double d = LS[i] - datapoint->getFeatureItem(i); - distance += d * d; - } - distance = sqrt(distance); - if (distance < EPSILON) return 1; - return 0; +double SESAME::MicroCluster::getInclusionProbability(PointPtr datapoint, + double radius) { + 
if (weight == 1) { + double distance = 0; + for (int i = 0; i < dim; i++) { + double d = LS[i] - datapoint->getFeatureItem(i); + distance += d * d; } + distance = sqrt(distance); + if (distance < EPSILON) + return 1; + return 0; + } else { + double dist = calCentroidDistance(datapoint); + if (dist <= getRadius(radius)) + return 0; else - { - double dist = calCentroidDistance(datapoint); - if (dist <= getRadius(radius)) - return 0; - else - return 1; - } + return 1; + } } // Calculate the standard deviation of vector in micro clusters -SESAME::dataPoint SESAME::MicroCluster::getVarianceVector() -{ - dataPoint datapoint; - - for (int i = 0; i < dim; i++) - { - double linearSum = LS[i]; - double squaredSum = SS[i]; - double aveLinearSum = linearSum / weight; - double squaredAveLinearSum = aveLinearSum * aveLinearSum; - double squaredSumSquared = squaredSum / weight; - datapoint.push_back(squaredSumSquared - squaredAveLinearSum); - if (datapoint[i] <= 0.0) datapoint[i] = MIN_VARIANCE; - } - return datapoint; +SESAME::dataPoint SESAME::MicroCluster::getVarianceVector() { + dataPoint datapoint; + + for (int i = 0; i < dim; i++) { + double linearSum = LS[i]; + double squaredSum = SS[i]; + double aveLinearSum = linearSum / weight; + double squaredAveLinearSum = aveLinearSum * aveLinearSum; + double squaredSumSquared = squaredSum / weight; + datapoint.push_back(squaredSumSquared - squaredAveLinearSum); + if (datapoint[i] <= 0.0) + datapoint[i] = MIN_VARIANCE; + } + return datapoint; } -// calculate the distance between input data point and centroid of micro-clusters - -double SESAME::MicroCluster::calCentroidDistance(PointPtr datapoint) -{ - double temp = 0; - double diff = 0; - for (int i = 0; i < dim; i++) - { - diff = centroid[i] - datapoint->getFeatureItem(i); - temp += (diff * diff); - } - return sqrt(temp); +// calculate the distance between input data point and centroid of +// micro-clusters + +double SESAME::MicroCluster::calCentroidDistance(PointPtr datapoint) { + 
double temp = 0; + double diff = 0; + for (int i = 0; i < dim; i++) { + diff = centroid[i] - datapoint->getFeatureItem(i); + temp += (diff * diff); + } + return sqrt(temp); } // Still need to modify void SESAME::MicroCluster::move() { this->centroid = this->LS; } -void SESAME::MicroCluster::decayWeight(double decayFactor) { this->weight *= decayFactor; } -double SESAME::MicroCluster::inverseError(double x) -{ - double z = sqrt(M_PI) * x; - double res = (z) / 2; +void SESAME::MicroCluster::decayWeight(double decayFactor) { + this->weight *= decayFactor; +} +double SESAME::MicroCluster::inverseError(double x) { + double z = sqrt(M_PI) * x; + double res = (z) / 2; - double z2 = z * z; - double zProd = z * z2; // z^3 - res += (1.0 / 24) * zProd; + double z2 = z * z; + double zProd = z * z2; // z^3 + res += (1.0 / 24) * zProd; - zProd *= z2; // z^5 - res += (7.0 / 960) * zProd; + zProd *= z2; // z^5 + res += (7.0 / 960) * zProd; - zProd *= z2; // z^7 - res += (127 * zProd) / 80640; + zProd *= z2; // z^7 + res += (127 * zProd) / 80640; - zProd *= z2; // z^9 - res += (4369 * zProd) / 11612160; + zProd *= z2; // z^9 + res += (4369 * zProd) / 11612160; - zProd *= z2; // z^11 - res += (34807 * zProd) / 364953600; + zProd *= z2; // z^11 + res += (34807 * zProd) / 364953600; - zProd *= z2; // z^13 - res += (20036983 * zProd) / 797058662400; + zProd *= z2; // z^13 + res += (20036983 * zProd) / 797058662400; - return res; + return res; } -SESAME::MicroClusterPtr SESAME::MicroCluster::copy() -{ - return std::make_shared(*this); +SESAME::MicroClusterPtr SESAME::MicroCluster::copy() { + return std::make_shared(*this); } diff --git a/src/Algorithm/DataStructure/OutlierResevoir.cpp b/src/Algorithm/DataStructure/OutlierResevoir.cpp index 1aa90479..1f3e69a2 100644 --- a/src/Algorithm/DataStructure/OutlierResevoir.cpp +++ b/src/Algorithm/DataStructure/OutlierResevoir.cpp @@ -10,75 +10,76 @@ /** * OutlierReservoir */ -SESAME::OutlierReservoir::OutlierReservoir() = default; 
+SESAME::OutlierReservoir::OutlierReservoir() = default; SESAME::OutlierReservoir::~OutlierReservoir() = default; double SESAME::OutlierReservoir::GetR() { return r; } void SESAME::OutlierReservoir::SetR(double r) { OutlierReservoir::r = r; } double SESAME::OutlierReservoir::GetTimeGap() { return timeGap; } -void SESAME::OutlierReservoir::SetTimeGap(double time_gap) { timeGap = time_gap; } +void SESAME::OutlierReservoir::SetTimeGap(double time_gap) { + timeGap = time_gap; +} long SESAME::OutlierReservoir::GetLastDelTime() { return lastDelTime; } -void SESAME::OutlierReservoir::SetLastDelTime(long last_del_time) { lastDelTime = last_del_time; } +void SESAME::OutlierReservoir::SetLastDelTime(long last_del_time) { + lastDelTime = last_del_time; +} double SESAME::OutlierReservoir::GetA() { return a; } void SESAME::OutlierReservoir::SetA(double a) { OutlierReservoir::a = a; } double SESAME::OutlierReservoir::GetLamd() { return lamd; } -void SESAME::OutlierReservoir::SetLamd(double lamd) { OutlierReservoir::lamd = lamd; } -std::unordered_set &SESAME::OutlierReservoir::getOutliers() { return outliers; } -void SESAME::OutlierReservoir::setOutliers(std::unordered_set &outliers) -{ - OutlierReservoir::outliers = outliers; +void SESAME::OutlierReservoir::SetLamd(double lamd) { + OutlierReservoir::lamd = lamd; } -SESAME::OutlierReservoir::OutlierReservoir(double r, double a, double lamd) -{ - this->r = r; - this->a = a; - this->lamd = lamd; +std::unordered_set &SESAME::OutlierReservoir::getOutliers() { + return outliers; } -void SESAME::OutlierReservoir::setTimeGap(double timeGap) { this->timeGap = timeGap; } -void SESAME::OutlierReservoir::insert(SESAME::DPNodePtr &c) -{ - c->SetDelta(DBL_MAX); - if (c->GetDep() != nullptr) - { - std::unordered_set successors = c->GetDep()->GetSucs(); - successors.erase(c); - } - this->outliers.insert(c); +void SESAME::OutlierReservoir::setOutliers( + std::unordered_set &outliers) { + OutlierReservoir::outliers = outliers; +} 
+SESAME::OutlierReservoir::OutlierReservoir(double r, double a, double lamd) { + this->r = r; + this->a = a; + this->lamd = lamd; +} +void SESAME::OutlierReservoir::setTimeGap(double timeGap) { + this->timeGap = timeGap; } -SESAME::DPNodePtr SESAME::OutlierReservoir::insert(SESAME::PointPtr &p, double time) -{ - double dis = 0; - auto minDis = DBL_MAX; - SESAME::DPNodePtr nn = nullptr; - for (auto it = this->outliers.begin(); it != this->outliers.end();) - { - if (time - double(it->get()->GetLastTime()) > this->timeGap) - { - this->outliers.erase(it++); - continue; - } - else - { - dis = p->L2Dist(it->get()->GetCenter()); - if (dis < minDis) - { - minDis = dis; - nn = it->get()->copy(); - } - it++; - } +void SESAME::OutlierReservoir::insert(SESAME::DPNodePtr &c) { + c->SetDelta(DBL_MAX); + if (c->GetDep() != nullptr) { + std::unordered_set successors = c->GetDep()->GetSucs(); + successors.erase(c); + } + this->outliers.insert(c); +} +SESAME::DPNodePtr SESAME::OutlierReservoir::insert(SESAME::PointPtr &p, + double time) { + double dis = 0; + auto minDis = DBL_MAX; + SESAME::DPNodePtr nn = nullptr; + for (auto it = this->outliers.begin(); it != this->outliers.end();) { + if (time - double(it->get()->GetLastTime()) > this->timeGap) { + this->outliers.erase(it++); + continue; + } else { + dis = p->L2Dist(it->get()->GetCenter()); + if (dis < minDis) { + minDis = dis; + nn = it->get()->copy(); + } + it++; } + } - if (nn == nullptr || minDis > r) - { - SESAME::DPNodePtr c = std::make_shared(p, time); - this->outliers.insert(c); - return c; - } - else - { - double coef = pow(a, lamd * (time - nn->GetLastTime())); - nn->add(coef, time); - return nn; - } + if (nn == nullptr || minDis > r) { + SESAME::DPNodePtr c = std::make_shared(p, time); + this->outliers.insert(c); + return c; + } else { + double coef = pow(a, lamd * (time - nn->GetLastTime())); + nn->add(coef, time); + return nn; + } +} +void SESAME::OutlierReservoir::remove(SESAME::DPNodePtr &nn) { + 
this->outliers.erase(nn); } -void SESAME::OutlierReservoir::remove(SESAME::DPNodePtr &nn) { this->outliers.erase(nn); } diff --git a/src/Algorithm/DataStructure/Point.cpp b/src/Algorithm/DataStructure/Point.cpp index d98f12a7..a1734664 100644 --- a/src/Algorithm/DataStructure/Point.cpp +++ b/src/Algorithm/DataStructure/Point.cpp @@ -7,23 +7,18 @@ #include "Algorithm/DataStructure/Point.hpp" -#include #include #include +#include +#include -namespace SESAME -{ - -Point::Point(int dim, int index, double weight, double cost, int timestamp) - : feature((dim + 3) / 4 * 4, 0.0) -{ - this->index = index; - this->weight = weight; - this->dim = dim; - this->cost = cost; - this->clu_id = -1; - this->timestamp = timestamp; - this->outlier = false; +namespace SESAME { + +Point::Point(uint32 dim, uint64 index, feature_t *feature) + : dim(dim), index(index), feature((dim + 3) / 4 * 4, 0.0) { + if (feature != nullptr) { + memcpy(this->feature.data(), feature, dim * sizeof(feature_t)); + } } int Point::getIndex() const { return this->index; } @@ -36,20 +31,14 @@ void Point::setWeight(double weight) { this->weight = weight; } double Point::getFeatureItem(int index) const { return this->feature[index]; } -void Point::setFeatureItem(double feature, int index) { this->feature[index] = feature; } +void Point::setFeatureItem(double feature, int index) { + this->feature[index] = feature; +} int Point::getClusteringCenter() const { return this->clu_id; } void Point::setClusteringCenter(int index) { this->clu_id = index; } -void Point::setCost(double c) { this->cost = c; } - -double Point::getCost() const { return this->cost; } - -void Point::setTimeStamp(int t) { this->timestamp = t; } - -int Point::getTimeStamp() const { return this->timestamp; } - /** * @param source */ @@ -63,71 +52,66 @@ double Point::getMinDist() const { return min_dist; } void Point::setMinDist(double min_dist) { min_dist = min_dist; } -double Point::L1Dist(PointPtr centroid) -{ - static const auto mask = 
_mm256_set1_pd(0x7fffffff); - double sum = 0; - const int dim = getDimension(); - auto a = feature.data(), b = centroid->feature.data(); - auto sum_v = _mm256_setzero_pd(); - for (size_t i = 0; i < dim; i += 4) - { - __m256d diff = _mm256_sub_pd(_mm256_loadu_pd(a + i), _mm256_loadu_pd(b + i)); - auto abs_v = _mm256_and_pd(diff, mask); - sum_v = _mm256_add_pd(sum_v, abs_v); - } - double v[4]; - _mm256_storeu_pd(v, sum_v); - sum += v[0] + v[1] + v[2] + v[3]; - return sum; +double Point::L1Dist(PointPtr centroid) { + static const auto mask = _mm256_set1_pd(0x7fffffff); + double sum = 0; + const int dim = getDimension(); + auto a = feature.data(), b = centroid->feature.data(); + auto sum_v = _mm256_setzero_pd(); + for (size_t i = 0; i < dim; i += 4) { + __m256d diff = + _mm256_sub_pd(_mm256_loadu_pd(a + i), _mm256_loadu_pd(b + i)); + auto abs_v = _mm256_and_pd(diff, mask); + sum_v = _mm256_add_pd(sum_v, abs_v); + } + double v[4]; + _mm256_storeu_pd(v, sum_v); + sum += v[0] + v[1] + v[2] + v[3]; + return sum; } -double Point::L2Dist(PointPtr centroid) -{ - double sum = 0.0; - const int dim = getDimension(); - auto a = feature.data(), b = centroid->feature.data(); - auto sum_v = _mm256_setzero_pd(); - for (size_t i = 0; i < dim; i += 4) - { - __m256d diff = _mm256_sub_pd(_mm256_loadu_pd(a + i), _mm256_loadu_pd(b + i)); - __m256d square = _mm256_mul_pd(diff, diff); - sum_v = _mm256_add_pd(sum_v, square); - } - double v[4]; - _mm256_storeu_pd(v, sum_v); - sum += v[0] + v[1] + v[2] + v[3]; - // for (int i = 0; i < dim; i++) { - // #ifndef NDEBUG - // assert(std::isnan(centroid->getFeatureItem(i)) == false); - // assert(std::isnan(getFeatureItem(i)) == false); - // #endif - // auto val = a[i] - b[i]; - // sum += val * val; - // } - return sqrt(sum); +double Point::L2Dist(PointPtr centroid) { + double sum = 0.0; + const int dim = getDimension(); + auto a = feature.data(), b = centroid->feature.data(); + auto sum_v = _mm256_setzero_pd(); + for (size_t i = 0; i < dim; i += 4) { 
+ __m256d diff = + _mm256_sub_pd(_mm256_loadu_pd(a + i), _mm256_loadu_pd(b + i)); + __m256d square = _mm256_mul_pd(diff, diff); + sum_v = _mm256_add_pd(sum_v, square); + } + double v[4]; + _mm256_storeu_pd(v, sum_v); + sum += v[0] + v[1] + v[2] + v[3]; + // for (int i = 0; i < dim; i++) { + // #ifndef NDEBUG + // assert(std::isnan(centroid->getFeatureItem(i)) == false); + // assert(std::isnan(getFeatureItem(i)) == false); + // #endif + // auto val = a[i] - b[i]; + // sum += val * val; + // } + return sqrt(sum); } void SESAME::Point::setOutlier(bool flag) { this->outlier = flag; } bool SESAME::Point::getOutlier() { return this->outlier; } -PointPtr Point::Reverse() -{ - auto res = copy(); - res->sgn = -res->sgn; - return res; +PointPtr Point::Reverse() { + auto res = copy(); + res->sgn = -res->sgn; + return res; } -std::string Point::Serialize() -{ - std::string str = "#" + std::to_string(index) + " " + std::to_string(dim); - for (int i = 0; i < dim; i++) - { - str += "," + std::to_string(feature.at(i)); - } - return str; +std::string Point::Serialize() { + std::string str = "#" + std::to_string(index) + " " + std::to_string(dim); + for (int i = 0; i < dim; i++) { + str += "," + std::to_string(feature.at(i)); + } + return str; } void Point::Debug() { std::cerr << Serialize() << std::endl; } -} // namespace SESAME +} // namespace SESAME diff --git a/src/Algorithm/DataStructure/Snapshot.cpp b/src/Algorithm/DataStructure/Snapshot.cpp index 9795467f..36d7b8f0 100644 --- a/src/Algorithm/DataStructure/Snapshot.cpp +++ b/src/Algorithm/DataStructure/Snapshot.cpp @@ -1,4 +1,5 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) // // Created by 1124a on 2021/8/16. 
@@ -8,100 +9,95 @@ #include #include -SESAME::Snapshot::Snapshot(MicroClusters& otherMicroClusters, int elapsedTime) -{ - this->elapsedTime = elapsedTime; - for (int a = 0; a < otherMicroClusters.size(); a++) - { - this->microClusters.push_back(otherMicroClusters[a]->copy()); - } +SESAME::Snapshot::Snapshot(MicroClusters &otherMicroClusters, int elapsedTime) { + this->elapsedTime = elapsedTime; + for (int a = 0; a < otherMicroClusters.size(); a++) { + this->microClusters.push_back(otherMicroClusters[a]->copy()); + } } -SESAME::Snapshot::~Snapshot() { std::vector().swap(this->microClusters); } -SESAME::SnapshotPtr SESAME::Snapshot::findSnapshot(QueueOrderSnapshot orderSnapShots, - int landmarkTime, int currentElapsedTime, - unsigned int currentOrder) -{ - int i = 0; - int minDistance = currentElapsedTime; - int dist, tempMinDistance = -1, elapsedTimeSnapshot; - int finalOrder, indexSnapshot; // the index of found snapshotPtr - // Found snapshotPtr - while (i <= currentOrder) - { - for (int a = 0; a < orderSnapShots[i].size(); a++) - { - elapsedTimeSnapshot = orderSnapShots[i][a]->elapsedTime; - dist = abs((int)(elapsedTimeSnapshot - landmarkTime)); - if (minDistance > dist || - (minDistance == dist && tempMinDistance < elapsedTimeSnapshot)) // - { - minDistance = dist; - tempMinDistance = elapsedTimeSnapshot; - finalOrder = i; - indexSnapshot = a; - // SESAME_INFO(" this one Elapsed Time is "<< elapsedTimeSnapshot<<",elapsed - // time is "<().swap(this->microClusters); +} +SESAME::SnapshotPtr +SESAME::Snapshot::findSnapshot(QueueOrderSnapshot orderSnapShots, + int landmarkTime, int currentElapsedTime, + unsigned int currentOrder) { + int i = 0; + int minDistance = currentElapsedTime; + int dist, tempMinDistance = -1, elapsedTimeSnapshot; + int finalOrder, indexSnapshot; // the index of found snapshotPtr + // Found snapshotPtr + while (i <= currentOrder) { + for (int a = 0; a < orderSnapShots[i].size(); a++) { + elapsedTimeSnapshot = 
orderSnapShots[i][a]->elapsedTime; + dist = abs((int)(elapsedTimeSnapshot - landmarkTime)); + if (minDistance > dist || + (minDistance == dist && tempMinDistance < elapsedTimeSnapshot)) // + { + minDistance = dist; + tempMinDistance = elapsedTimeSnapshot; + finalOrder = i; + indexSnapshot = a; + // SESAME_INFO(" this one Elapsed Time is "<< + // elapsedTimeSnapshot<<",elapsed time is "<microClusters, - orderSnapShots[finalOrder][indexSnapshot]->elapsedTime); - // SESAME_INFO("close landmark "<< tempMinDistance); - // SESAME_INFO("just for contrast : "<< orderSnapShots[finalOrder][indexSnapshot]->elapsedTime); - return nearestSnapshot; + SnapshotPtr nearestSnapshot = DataStructureFactory::createSnapshot( + orderSnapShots[finalOrder][indexSnapshot]->microClusters, + orderSnapShots[finalOrder][indexSnapshot]->elapsedTime); + // SESAME_INFO("close landmark "<< tempMinDistance); + // SESAME_INFO("just for contrast : "<< + // orderSnapShots[finalOrder][indexSnapshot]->elapsedTime); + return nearestSnapshot; } -SESAME::SnapshotPtr SESAME::Snapshot::substractSnapshot(SnapshotPtr snapshotCurrent, - const SnapshotPtr& snapshotLandmark, - unsigned int num_clusters) -{ - // SESAME_INFO("Start substract "<elapsedTime<<" LANDMARK ET - // "<elapsedTime); - for (unsigned int i = 0; i < num_clusters; i++) - { // If the micro cluster raised from merging - if (snapshotCurrent->microClusters[i]->id.size() > 1) - { - for (unsigned int j = 0; j < num_clusters; j++) - { - if (snapshotLandmark->microClusters[j]->id.size() > 1) - { - if (snapshotCurrent->microClusters[i]->judgeMerge( - snapshotLandmark->microClusters[j])) - snapshotCurrent->microClusters[i]->subtractClusterVector( - snapshotLandmark->microClusters[j]); - } - else - { - int clusterIdLandmark; - clusterIdLandmark = snapshotLandmark->microClusters[j]->id[0]; - if (std::find(snapshotCurrent->microClusters[i]->id.begin(), - snapshotCurrent->microClusters[i]->id.end(), - clusterIdLandmark) != 
snapshotCurrent->microClusters[i]->id.end()) - snapshotCurrent->microClusters[i]->subtractClusterVector( - snapshotLandmark->microClusters[j]); - } - } +SESAME::SnapshotPtr +SESAME::Snapshot::substractSnapshot(SnapshotPtr snapshotCurrent, + const SnapshotPtr &snapshotLandmark, + unsigned int num_clusters) { + // SESAME_INFO("Start substract "<elapsedTime<<" LANDMARK + // ET + // "<elapsedTime); + for (unsigned int i = 0; i < num_clusters; + i++) { // If the micro cluster raised from merging + if (snapshotCurrent->microClusters[i]->id.size() > 1) { + for (unsigned int j = 0; j < num_clusters; j++) { + if (snapshotLandmark->microClusters[j]->id.size() > 1) { + if (snapshotCurrent->microClusters[i]->judgeMerge( + snapshotLandmark->microClusters[j])) + snapshotCurrent->microClusters[i]->subtractClusterVector( + snapshotLandmark->microClusters[j]); + } else { + int clusterIdLandmark; + clusterIdLandmark = snapshotLandmark->microClusters[j]->id[0]; + if (std::find(snapshotCurrent->microClusters[i]->id.begin(), + snapshotCurrent->microClusters[i]->id.end(), + clusterIdLandmark) != + snapshotCurrent->microClusters[i]->id.end()) + snapshotCurrent->microClusters[i]->subtractClusterVector( + snapshotLandmark->microClusters[j]); } - // The micro cluster raised from creating new ones or the original ones - else - { - for (unsigned int j = 0; j < num_clusters; j++) - { - if (snapshotLandmark->microClusters[j]->id.size() == 1) - { - int clusterIdLandmark = snapshotLandmark->microClusters[j]->id[0]; - if (snapshotCurrent->microClusters[i]->id[0] == clusterIdLandmark) - snapshotCurrent->microClusters[i]->subtractClusterVector( - snapshotLandmark->microClusters[j]); - } - } + } + } + // The micro cluster raised from creating new ones or the original ones + else { + for (unsigned int j = 0; j < num_clusters; j++) { + if (snapshotLandmark->microClusters[j]->id.size() == 1) { + int clusterIdLandmark = snapshotLandmark->microClusters[j]->id[0]; + if 
(snapshotCurrent->microClusters[i]->id[0] == clusterIdLandmark) + snapshotCurrent->microClusters[i]->subtractClusterVector( + snapshotLandmark->microClusters[j]); } + } } + } - return snapshotCurrent; + return snapshotCurrent; +} +SESAME::SnapshotPtr SESAME::Snapshot::copy() { + return std::make_shared(*this); } -SESAME::SnapshotPtr SESAME::Snapshot::copy() { return std::make_shared(*this); } diff --git a/src/Algorithm/DataStructure/TreeNode.cpp b/src/Algorithm/DataStructure/TreeNode.cpp index 9c5d1b2b..81f03ee2 100644 --- a/src/Algorithm/DataStructure/TreeNode.cpp +++ b/src/Algorithm/DataStructure/TreeNode.cpp @@ -1,4 +1,5 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) // // Created by Shuhao Zhang on 26/07/2021. diff --git a/src/Algorithm/DataStructure/WeightedAdjacencyList.cpp b/src/Algorithm/DataStructure/WeightedAdjacencyList.cpp index 18605e34..5d9eba81 100644 --- a/src/Algorithm/DataStructure/WeightedAdjacencyList.cpp +++ b/src/Algorithm/DataStructure/WeightedAdjacencyList.cpp @@ -3,37 +3,32 @@ // #include -SESAME::AdjustedWeight::AdjustedWeight(double weight, int pointTime, timespec pointTime0) -{ - this->weight = weight; - this->updateTime = pointTime; - this->updateTime0 = pointTime0; +SESAME::AdjustedWeight::AdjustedWeight(double weight, int pointTime, + timespec pointTime0) { + this->weight = weight; + this->updateTime = pointTime; + this->updateTime0 = pointTime0; } -void SESAME::AdjustedWeight::add(timespec startTime, double decayValue) -{ - long elapsedTime = ((updateTime0).tv_sec * 1000000L + (updateTime0).tv_nsec / 1000L) - - ((startTime).tv_sec * 1000000L + (startTime).tv_nsec / 1000L); - if (elapsedTime == 0) - { - weight++; - } - else - { - weight *= decayValue + 1; - this->updateTime0 = startTime; - } +void SESAME::AdjustedWeight::add(timespec startTime, double decayValue) { + long elapsedTime = + ((updateTime0).tv_sec 
* 1000000L + (updateTime0).tv_nsec / 1000L) - + ((startTime).tv_sec * 1000000L + (startTime).tv_nsec / 1000L); + if (elapsedTime == 0) { + weight++; + } else { + weight *= decayValue + 1; + this->updateTime0 = startTime; + } } -void SESAME::AdjustedWeight::add(int startTime, double decayValue) -{ - if (startTime == this->updateTime) - { - weight++; - } - else - { - weight = weight * decayValue + 1; - this->updateTime = startTime; - } +void SESAME::AdjustedWeight::add(int startTime, double decayValue) { + if (startTime == this->updateTime) { + weight++; + } else { + weight = weight * decayValue + 1; + this->updateTime = startTime; + } } -double SESAME::AdjustedWeight::getCurrentWeight(double decayFactor) { return weight * decayFactor; } +double SESAME::AdjustedWeight::getCurrentWeight(double decayFactor) { + return weight * decayFactor; +} diff --git a/src/Algorithm/DenStream.cpp b/src/Algorithm/DenStream.cpp index ef40aa1c..4b0fc504 100644 --- a/src/Algorithm/DenStream.cpp +++ b/src/Algorithm/DenStream.cpp @@ -6,277 +6,256 @@ #include "Algorithm/DataStructure/DataStructureFactory.hpp" #include "Algorithm/WindowModel/WindowFactory.hpp" -SESAME::DenStream::DenStream(param_t &cmd_params) -{ - this->param = cmd_params; - this->denStreamParams.num_points = cmd_params.num_points; - this->denStreamParams.dim = cmd_params.dim; - this->denStreamParams.min_points = cmd_params.min_points; - this->denStreamParams.epsilon = cmd_params.epsilon; - this->denStreamParams.base = cmd_params.base; - this->denStreamParams.lambda = cmd_params.lambda; - this->denStreamParams.mu = cmd_params.mu; - this->denStreamParams.beta = cmd_params.beta; - this->denStreamParams.buf_sizeSize = cmd_params.buf_size; +SESAME::DenStream::DenStream(param_t &cmd_params) { + this->param = cmd_params; + this->denStreamParams.num_points = cmd_params.num_points; + this->denStreamParams.dim = cmd_params.dim; + this->denStreamParams.min_points = cmd_params.min_points; + this->denStreamParams.epsilon = 
cmd_params.epsilon; + this->denStreamParams.base = cmd_params.base; + this->denStreamParams.lambda = cmd_params.lambda; + this->denStreamParams.mu = cmd_params.mu; + this->denStreamParams.beta = cmd_params.beta; + this->denStreamParams.buf_sizeSize = cmd_params.buf_size; } SESAME::DenStream::~DenStream() {} -void SESAME::DenStream::Init(vector &initData) -{ - this->pMicroClusterIndex = -1; - this->oMicroClusterIndex = -1; - for (int i = 0; i < denStreamParams.buf_sizeSize; i++) - { - if (initData.at(i)->getClusteringCenter() == noVisited) - { - std::vector pointIndex; - pMicroClusterIndex++; - MicroClusterPtr newMicroCluster = SESAME::DataStructureFactory::createMicroCluster( - denStreamParams.dim, pMicroClusterIndex); - newMicroCluster->Init(initData.at(i), 0); - pointsNearCorePoint(initData, pointIndex, newMicroCluster); - if (newMicroCluster->weight <= this->minWeight) // TODO need to change minweight - { - pMicroClusterIndex--; - for (int index : pointIndex) - { - initData.at(index)->setClusteringCenter(noVisited); - } - } - else - pMicroClusters.push_back(newMicroCluster); +void SESAME::DenStream::Init(vector &initData) { + this->pMicroClusterIndex = -1; + this->oMicroClusterIndex = -1; + for (int i = 0; i < denStreamParams.buf_sizeSize; i++) { + if (initData.at(i)->getClusteringCenter() == noVisited) { + std::vector pointIndex; + pMicroClusterIndex++; + MicroClusterPtr newMicroCluster = + SESAME::DataStructureFactory::createMicroCluster(denStreamParams.dim, + pMicroClusterIndex); + newMicroCluster->Init(initData.at(i), 0); + pointsNearCorePoint(initData, pointIndex, newMicroCluster); + if (newMicroCluster->weight <= + this->minWeight) // TODO need to change minweight + { + pMicroClusterIndex--; + for (int index : pointIndex) { + initData.at(index)->setClusteringCenter(noVisited); } + } else + pMicroClusters.push_back(newMicroCluster); } - SESAME_INFO("NOW PMC number is: " << this->pMicroClusterIndex << " , Init succeed!"); + } + SESAME_INFO("NOW PMC number 
is: " << this->pMicroClusterIndex + << " , Init succeed!"); } -void SESAME::DenStream::pointsNearCorePoint(vector &initData, std::vector pointIndex, - MicroClusterPtr microCluster) -{ - int size = denStreamParams.buf_sizeSize; - for (int i = 0; i < size; i++) - { - if (initData.at(i)->getClusteringCenter() == noVisited) - { - double dist = microCluster->calCentroidDistance(initData.at(i)); - if (dist < denStreamParams.epsilon) - { - initData[i]->setClusteringCenter(microCluster->id.front()); - microCluster->insert(initData.at(i), 0); - pointIndex.push_back(i); - } - } +void SESAME::DenStream::pointsNearCorePoint(vector &initData, + std::vector pointIndex, + MicroClusterPtr microCluster) { + int size = denStreamParams.buf_sizeSize; + for (int i = 0; i < size; i++) { + if (initData.at(i)->getClusteringCenter() == noVisited) { + double dist = microCluster->calCentroidDistance(initData.at(i)); + if (dist < denStreamParams.epsilon) { + initData[i]->setClusteringCenter(microCluster->id.front()); + microCluster->insert(initData.at(i), 0); + pointIndex.push_back(i); + } } + } } -void SESAME::DenStream::Init() -{ - this->dampedWindow = - WindowFactory::createDampedWindow(denStreamParams.base, denStreamParams.lambda); - this->dbscan = std::make_shared(denStreamParams.min_points, denStreamParams.epsilon); - this->startTime = 0; - this->lastUpdateTime = 0; - this->pointArrivingTime = 0; - this->minWeight = denStreamParams.beta * denStreamParams.mu; - this->Tp = (double)(1 / denStreamParams.lambda) * - (log(minWeight / (minWeight - 1)) / log(denStreamParams.base)); - if (this->Tp > 1000 || this->Tp <= 0) this->Tp = 1; - sum_timer.Tick(); +void SESAME::DenStream::Init() { + this->dampedWindow = WindowFactory::createDampedWindow( + denStreamParams.base, denStreamParams.lambda); + this->dbscan = std::make_shared(denStreamParams.min_points, + denStreamParams.epsilon); + this->startTime = 0; + this->lastUpdateTime = 0; + this->pointArrivingTime = 0; + this->minWeight = 
denStreamParams.beta * denStreamParams.mu; + this->Tp = (double)(1 / denStreamParams.lambda) * + (log(minWeight / (minWeight - 1)) / log(denStreamParams.base)); + if (this->Tp > 1000 || this->Tp <= 0) + this->Tp = 1; + sum_timer.Tick(); } -void SESAME::DenStream::RunOnline(PointPtr in) -{ - PointPtr input = in; +void SESAME::DenStream::RunOnline(PointPtr in) { + PointPtr input = in; - if (!this->isInitial) - { - ds_timer.Tick(); - input->setClusteringCenter(noVisited); - this->initialBuffer.push_back(input); - if (this->initialBuffer.size() == this->denStreamParams.buf_sizeSize) - { - // Initialize part - Init(this->initialBuffer); - this->isInitial = true; - } - ds_timer.Tock(); + if (!this->isInitial) { + ds_timer.Tick(); + input->setClusteringCenter(noVisited); + this->initialBuffer.push_back(input); + if (this->initialBuffer.size() == this->denStreamParams.buf_sizeSize) { + // Initialize part + Init(this->initialBuffer); + this->isInitial = true; } - else - { - this->pointArrivingTime = input->getIndex(); - merge(input); - int elapsedTime = this->pointArrivingTime - this->lastUpdateTime; - - if (elapsedTime >= this->Tp) - { - ds_timer.Tick(); - // timerMeter.outlierDetectionAccMeasure(); - // SESAME_INFO("Check "<pointArrivingTime = input->getIndex(); + merge(input); + int elapsedTime = this->pointArrivingTime - this->lastUpdateTime; - for (int iter = 0; iter < pMicroClusters.size(); iter++) - { - if (pMicroClusters.at(iter)->weight < minWeight) - { - pMicroClusters.erase(pMicroClusters.begin() + iter); - // SESAME_INFO("NOW PMC number is: " << this->pMicroClusterIndex); - } - } - // timerMeter.outlierDetectionEndMeasure(); + if (elapsedTime >= this->Tp) { + ds_timer.Tick(); + // timerMeter.outlierDetectionAccMeasure(); + // SESAME_INFO("Check "<createTime + this->Tp); - double b = -denStreamParams.lambda * this->Tp; - double Xi = - (pow(denStreamParams.base, a) - 1) / (pow(denStreamParams.base, b) - 1); - // SESAME_INFO("NOW Xi "<weight < Xi) - { - 
oMicroClusters.erase(oMicroClusters.begin() + iter); - // SESAME_INFO("NOW oMicroClusterIndex number is: " << - // oMicroClusters.size()); - } - } - } - ds_timer.Tock(); + for (int iter = 0; iter < pMicroClusters.size(); iter++) { + if (pMicroClusters.at(iter)->weight < minWeight) { + pMicroClusters.erase(pMicroClusters.begin() + iter); + // SESAME_INFO("NOW PMC number is: " << this->pMicroClusterIndex); + } + } + // timerMeter.outlierDetectionEndMeasure(); - this->lastUpdateTime = this->pointArrivingTime; + if (!oMicroClusters.empty()) { + for (int iter = 0; iter < oMicroClusters.size(); iter++) { + double a = -(denStreamParams.lambda) * + (pointArrivingTime - oMicroClusters.at(iter)->createTime + + this->Tp); + double b = -denStreamParams.lambda * this->Tp; + double Xi = (pow(denStreamParams.base, a) - 1) / + (pow(denStreamParams.base, b) - 1); + // SESAME_INFO("NOW Xi "<weight < Xi) { + oMicroClusters.erase(oMicroClusters.begin() + iter); + // SESAME_INFO("NOW oMicroClusterIndex number is: " << + // oMicroClusters.size()); + } } - this->lastPointTime = this->pointArrivingTime; + } + ds_timer.Tock(); + + this->lastUpdateTime = this->pointArrivingTime; } - lat_timer.Add(input->toa); + this->lastPointTime = this->pointArrivingTime; + } + lat_timer.Add(input->toa); } -void SESAME::DenStream::merge(PointPtr dataPoint) -{ - bool index = false; - if (!this->pMicroClusters.empty()) - { - index = mergeToMicroCluster(dataPoint, this->pMicroClusters); - // std::cout<<"Merge into PMC! "<pMicroClusters.empty()) { + index = mergeToMicroCluster(dataPoint, this->pMicroClusters); + // std::cout<<"Merge into PMC! "<oMicroClusters.empty()) - { - // Time measurement inside the mergeToOMicroCluster function - index = mergeToOMicroCluster(dataPoint, this->oMicroClusters); - // std::cout<<"Merge into OMC! 
"<oMicroClusters.empty()) { + // Time measurement inside the mergeToOMicroCluster function + index = mergeToOMicroCluster(dataPoint, this->oMicroClusters); + // std::cout<<"Merge into OMC! "<Init(dataPoint, 0); - oMicroClusters.push_back(newOMicroCluster); - } - out_timer.Tock(); + out_timer.Tick(); + if (!index) { + oMicroClusterIndex++; + MicroClusterPtr newOMicroCluster = DataStructureFactory::createMicroCluster( + denStreamParams.dim, oMicroClusterIndex); + newOMicroCluster->Init(dataPoint, 0); + oMicroClusters.push_back(newOMicroCluster); + } + out_timer.Tock(); } -bool SESAME::DenStream::mergeToMicroCluster(PointPtr dataPoint, - std::vector microClusters) -{ - ds_timer.Tick(); - bool index = false; - MicroClusterPtr MC = nearestNeighbor(dataPoint, microClusters); - ds_timer.Tock(); - win_timer.Tick(); - double decayFactor = this->dampedWindow->decayFunction(lastPointTime, pointArrivingTime); - if (MC != NULL && MC->insert(dataPoint, decayFactor, denStreamParams.epsilon)) - { - index = true; - } - win_timer.Tock(); - return index; +bool SESAME::DenStream::mergeToMicroCluster( + PointPtr dataPoint, std::vector microClusters) { + ds_timer.Tick(); + bool index = false; + MicroClusterPtr MC = nearestNeighbor(dataPoint, microClusters); + ds_timer.Tock(); + win_timer.Tick(); + double decayFactor = + this->dampedWindow->decayFunction(lastPointTime, pointArrivingTime); + if (MC != NULL && + MC->insert(dataPoint, decayFactor, denStreamParams.epsilon)) { + index = true; + } + win_timer.Tock(); + return index; } -bool SESAME::DenStream::mergeToOMicroCluster(PointPtr dataPoint, - std::vector microClusters) -{ - out_timer.Tick(); - MicroClusterPtr MC = nearestNeighbor(dataPoint, microClusters); - out_timer.Tock(); - win_timer.Tick(); - double decayFactor = this->dampedWindow->decayFunction(lastPointTime, pointArrivingTime); - if (MC != NULL && MC->insert(dataPoint, decayFactor, denStreamParams.epsilon)) - { - double decayValue = - 
this->dampedWindow->decayFunction(MC->lastUpdateTime, pointArrivingTime); - win_timer.Tock(); - ds_timer.Tick(); - if (MC->weight * decayValue > minWeight) - { - pMicroClusterIndex++; - MC->resetID(pMicroClusterIndex); - pMicroClusters.push_back(MC); - int index = findIndex(oMicroClusters, MC); - oMicroClusters.erase(oMicroClusters.begin() + index); - // std::remove(oMicroClusters.begin(), oMicroClusters.end(), MC); - } - ds_timer.Tock(); - return true; - } - else - { - win_timer.Tock(); - return false; +bool SESAME::DenStream::mergeToOMicroCluster( + PointPtr dataPoint, std::vector microClusters) { + out_timer.Tick(); + MicroClusterPtr MC = nearestNeighbor(dataPoint, microClusters); + out_timer.Tock(); + win_timer.Tick(); + double decayFactor = + this->dampedWindow->decayFunction(lastPointTime, pointArrivingTime); + if (MC != NULL && + MC->insert(dataPoint, decayFactor, denStreamParams.epsilon)) { + double decayValue = this->dampedWindow->decayFunction(MC->lastUpdateTime, + pointArrivingTime); + win_timer.Tock(); + ds_timer.Tick(); + if (MC->weight * decayValue > minWeight) { + pMicroClusterIndex++; + MC->resetID(pMicroClusterIndex); + pMicroClusters.push_back(MC); + int index = findIndex(oMicroClusters, MC); + oMicroClusters.erase(oMicroClusters.begin() + index); + // std::remove(oMicroClusters.begin(), oMicroClusters.end(), MC); } + ds_timer.Tock(); + return true; + } else { + win_timer.Tock(); + return false; + } } -int SESAME::DenStream::findIndex(std::vector µClusters, MicroClusterPtr MC) -{ - auto ret = std::find(microClusters.begin(), microClusters.end(), MC); - if (ret != microClusters.end()) return ret - microClusters.begin(); - return -1; +int SESAME::DenStream::findIndex(std::vector µClusters, + MicroClusterPtr MC) { + auto ret = std::find(microClusters.begin(), microClusters.end(), MC); + if (ret != microClusters.end()) + return ret - microClusters.begin(); + return -1; } -SESAME::MicroClusterPtr SESAME::DenStream::nearestNeighbor( - PointPtr 
dataPoint, std::vector microClusters) -{ - MicroClusterPtr targetMC = NULL; - double dist = 0, minDist = std::numeric_limits::max(); - for (vector::size_type i = 0; i < microClusters.size(); i++) - { - dist = microClusters.at(i)->calCentroidDistance(dataPoint); - if (dist < minDist) - { - minDist = dist; - targetMC = microClusters.at(i); - } +SESAME::MicroClusterPtr +SESAME::DenStream::nearestNeighbor(PointPtr dataPoint, + std::vector microClusters) { + MicroClusterPtr targetMC = NULL; + double dist = 0, minDist = std::numeric_limits::max(); + for (vector::size_type i = 0; i < microClusters.size(); + i++) { + dist = microClusters.at(i)->calCentroidDistance(dataPoint); + if (dist < minDist) { + minDist = dist; + targetMC = microClusters.at(i); } + } - return targetMC; + return targetMC; } -void SESAME::DenStream::RunOffline(DataSinkPtr sinkPtr) -{ - on_timer.Add(sum_timer.start); - ref_timer.Tick(); - vector transformedPoints; - std::vector> oldGroups; - microClusterToPoint(pMicroClusters, transformedPoints); - this->dbscan->run(transformedPoints); - this->dbscan->produceResult(transformedPoints, sinkPtr); - for (auto out = this->oMicroClusters.begin(); out != this->oMicroClusters.end(); ++out) - { - PointPtr center = out->get()->getCenter(); - center->setClusteringCenter(-1); - center->setOutlier(true); - sinkPtr->put(center->copy()); - } - ref_timer.Tock(); - sum_timer.Tock(); +void SESAME::DenStream::RunOffline(DataSinkPtr sinkPtr) { + on_timer.Add(sum_timer.start); + ref_timer.Tick(); + vector transformedPoints; + std::vector> oldGroups; + microClusterToPoint(pMicroClusters, transformedPoints); + this->dbscan->run(transformedPoints); + this->dbscan->produceResult(transformedPoints, sinkPtr); + for (auto out = this->oMicroClusters.begin(); + out != this->oMicroClusters.end(); ++out) { + PointPtr center = out->get()->getCenter(); + center->setClusteringCenter(-1); + center->setOutlier(true); + sinkPtr->put(center->copy()); + } + ref_timer.Tock(); + 
sum_timer.Tock(); } -void SESAME::DenStream::microClusterToPoint(std::vector µClusters, - vector &points) -{ - for (std::vector::size_type i = 0; i < microClusters.size(); i++) - { - PointPtr point = DataStructureFactory::createPoint(i, microClusters.at(i)->weight, - microClusters.at(i)->centroid.size(), 0); - for (int j = 0; j < microClusters.at(i)->centroid.size(); j++) - point->setFeatureItem(microClusters[i]->centroid[j], j); - points.push_back(point); - } +void SESAME::DenStream::microClusterToPoint( + std::vector µClusters, vector &points) { + for (std::vector::size_type i = 0; i < microClusters.size(); + i++) { + PointPtr point = + GenericFactory::New(microClusters.at(i)->centroid.size(), i); + point->weight = microClusters.at(i)->weight; + for (int j = 0; j < microClusters.at(i)->centroid.size(); j++) + point->setFeatureItem(microClusters[i]->centroid[j], j); + points.push_back(point); + } } diff --git a/src/Algorithm/DesignAspect/V10.cpp b/src/Algorithm/DesignAspect/V10.cpp index d982bb3f..5b6f7efb 100644 --- a/src/Algorithm/DesignAspect/V10.cpp +++ b/src/Algorithm/DesignAspect/V10.cpp @@ -5,227 +5,201 @@ #include #include -SESAME::V10::V10(param_t &cmd_params) -{ - this->param = cmd_params; - this->V10Param.num_points = cmd_params.num_points; - this->V10Param.dim = cmd_params.dim; - this->V10Param.alpha = cmd_params.alpha; - this->V10Param.lamda = 0; - this->V10Param.beta = cmd_params.beta; - this->V10Param.num_cache = cmd_params.num_cache; - this->V10Param.radius = cmd_params.radius; - this->V10Param.minDelta = cmd_params.delta; - this->V10Param.opt = cmd_params.opt; - this->V10Param.landmark = cmd_params.landmark; - sum_timer.Tick(); +SESAME::V10::V10(param_t &cmd_params) { + this->param = cmd_params; + this->V10Param.num_points = cmd_params.num_points; + this->V10Param.dim = cmd_params.dim; + this->V10Param.alpha = cmd_params.alpha; + this->V10Param.lamda = 0; + this->V10Param.beta = cmd_params.beta; + this->V10Param.num_cache = cmd_params.num_cache; + 
this->V10Param.radius = cmd_params.radius; + this->V10Param.minDelta = cmd_params.delta; + this->V10Param.opt = cmd_params.opt; + this->V10Param.landmark = cmd_params.landmark; + sum_timer.Tick(); } -SESAME::V10::~V10(){}; - -void SESAME::V10::Init() -{ - this->cache = - SESAME::DataStructureFactory::creatCache(this->V10Param.num_cache, this->V10Param.alpha, - this->V10Param.lamda, this->V10Param.radius); - this->outres = SESAME::DataStructureFactory::createOutlierReservoir( - this->V10Param.radius, this->V10Param.alpha, this->V10Param.lamda); - this->dpTree = - SESAME::DataStructureFactory::createDPTree(this->actCluMaxNum, this->V10Param.radius); - this->dpTree->SetMinDelta(this->V10Param.minDelta); +void SESAME::V10::Init() { + this->cache = SESAME::DataStructureFactory::creatCache( + this->V10Param.num_cache, this->V10Param.alpha, this->V10Param.lamda, + this->V10Param.radius); + this->outres = SESAME::DataStructureFactory::createOutlierReservoir( + this->V10Param.radius, this->V10Param.alpha, this->V10Param.lamda); + this->dpTree = SESAME::DataStructureFactory::createDPTree( + this->actCluMaxNum, this->V10Param.radius); + this->dpTree->SetMinDelta(this->V10Param.minDelta); } -void SESAME::V10::OutputOnline(std::vector &output) -{ - ref_timer.Tick(); - auto clu = 0; - for (const auto ¢er : this->onlineCenters) - { - center->setClusteringCenter(clu++); - output.push_back(center->copy()); - } - for (const auto &cluster : this->clusters) - { - std::unordered_set cells = cluster->GetCells(); - for (const auto &cell : cells) - { - PointPtr center = cell->GetCenter(); - center->setClusteringCenter(clu++); - center->setOutlier(false); - output.push_back(center->copy()); - } +void SESAME::V10::OutputOnline(std::vector &output) { + ref_timer.Tick(); + auto clu = 0; + for (const auto ¢er : this->onlineCenters) { + center->setClusteringCenter(clu++); + output.push_back(center->copy()); + } + for (const auto &cluster : this->clusters) { + std::unordered_set cells = 
cluster->GetCells(); + for (const auto &cell : cells) { + PointPtr center = cell->GetCenter(); + center->setClusteringCenter(clu++); + center->setOutlier(false); + output.push_back(center->copy()); } + } } - -void SESAME::V10::setMinDelta(double minDelta) -{ - this->V10Param.minDelta = minDelta; - this->dpTree->SetMinDelta(minDelta); +void SESAME::V10::setMinDelta(double minDelta) { + this->V10Param.minDelta = minDelta; + this->dpTree->SetMinDelta(minDelta); } -void SESAME::V10::InitDP(double time) -{ - cache->compDeltaRho(time); - this->minRho = this->V10Param.beta; +void SESAME::V10::InitDP(double time) { + cache->compDeltaRho(time); + this->minRho = this->V10Param.beta; - outres->setTimeGap(INT64_MAX); - cache->getDPTree(this->minRho, this->V10Param.minDelta, dpTree, outres, clusters); - dpTree->SetLastTime(time); + outres->setTimeGap(INT64_MAX); + cache->getDPTree(this->minRho, this->V10Param.minDelta, dpTree, outres, + clusters); + dpTree->SetLastTime(time); } -SESAME::DPNodePtr SESAME::V10::streamProcess(SESAME::PointPtr p, int opt, double time) -{ - win_timer.Tick(); - double coef = pow(this->V10Param.alpha, this->V10Param.lamda * (time - dpTree->GetLastTime())); - dpTree->SetLastTime(time); - win_timer.Tock(); - ds_timer.Tick(); - auto nn = dpTree->findNN(p, coef, opt, time); - ds_timer.Tock(); - out_timer.Tick(); - // if (nn == nullptr || nn->GetDis() > dpTree->GetCluR()) - // { - // nn = outres->insert(p, time); - // if (nn->GetRho() > this->minRho) - // { - // outres->remove(nn); - // dpTree->insert(nn, opt); - // } - // } - dpTree->deleteInact(outres, this->minRho, time); - out_timer.Tock(); - return nn; +SESAME::DPNodePtr SESAME::V10::streamProcess(SESAME::PointPtr p, int opt, + double time) { + win_timer.Tick(); + double coef = pow(this->V10Param.alpha, + this->V10Param.lamda * (time - dpTree->GetLastTime())); + dpTree->SetLastTime(time); + win_timer.Tock(); + ds_timer.Tick(); + auto nn = dpTree->findNN(p, coef, opt, time); + ds_timer.Tock(); + 
out_timer.Tick(); + // if (nn == nullptr || nn->GetDis() > dpTree->GetCluR()) + // { + // nn = outres->insert(p, time); + // if (nn->GetRho() > this->minRho) + // { + // outres->remove(nn); + // dpTree->insert(nn, opt); + // } + // } + dpTree->deleteInact(outres, this->minRho, time); + out_timer.Tock(); + return nn; } -double SESAME::V10::computeAlpha() { return dpTree->computeAlpha(this->V10Param.minDelta); } -double SESAME::V10::adjustMinDelta() { return dpTree->adjustMinDelta(this->alpha); } -void SESAME::V10::delCluster() -{ - for (auto it = this->clusters.begin(); it != this->clusters.end();) - { - auto cluster = it->get(); - if (cluster->GetCells().begin() == cluster->GetCells().end()) - { - this->clusters.erase(it++); - } - else - { - it++; - } +double SESAME::V10::computeAlpha() { + return dpTree->computeAlpha(this->V10Param.minDelta); +} +double SESAME::V10::adjustMinDelta() { + return dpTree->adjustMinDelta(this->alpha); +} +void SESAME::V10::delCluster() { + for (auto it = this->clusters.begin(); it != this->clusters.end();) { + auto cluster = it->get(); + if (cluster->GetCells().begin() == cluster->GetCells().end()) { + this->clusters.erase(it++); + } else { + it++; } + } } -SESAME::DPNodePtr SESAME::V10::retrive(SESAME::PointPtr p, int opt, double time) -{ - SESAME::PointPtr curP = p; - if (!this->V10Param.isInit) - { - auto cc = cache->add(curP, time); - if (cache->isFull()) - { - // draw decision graph - InitDP(time); - this->alpha = computeAlpha(); // TODO: what does it mean? - this->V10Param.isInit = true; - } - return cc; +SESAME::DPNodePtr SESAME::V10::retrive(SESAME::PointPtr p, int opt, + double time) { + SESAME::PointPtr curP = p; + if (!this->V10Param.isInit) { + auto cc = cache->add(curP, time); + if (cache->isFull()) { + // draw decision graph + InitDP(time); + this->alpha = computeAlpha(); // TODO: what does it mean? 
+ this->V10Param.isInit = true; } - else - { - ds_timer.Tick(); - auto nn = streamProcess(curP, opt, time); - this->dpTree->adjustCluster(clusters); - ds_timer.Tock(); + return cc; + } else { + ds_timer.Tick(); + auto nn = streamProcess(curP, opt, time); + this->dpTree->adjustCluster(clusters); + ds_timer.Tock(); - out_timer.Tick(); - delCluster(); - out_timer.Tock(); - return nn; - } + out_timer.Tick(); + delCluster(); + out_timer.Tock(); + return nn; + } } -void SESAME::V10::CountNode(const SESAME::DPNodePtr &node, int &num) -{ - num = num + 1; - if (!node->GetSucs().empty()) - { - for (const SESAME::DPNodePtr &el : node->GetSucs()) - { - CountNode(el, num); - } +void SESAME::V10::CountNode(const SESAME::DPNodePtr &node, int &num) { + num = num + 1; + if (!node->GetSucs().empty()) { + for (const SESAME::DPNodePtr &el : node->GetSucs()) { + CountNode(el, num); } + } } -void SESAME::V10::RunOnline(SESAME::PointPtr input) -{ - if (input->getIndex() != 0 and input->getIndex() % V10Param.landmark == 0) - { - for (const auto &cluster : this->clusters) - { - std::unordered_set cells = cluster->GetCells(); - for (const auto &cell : cells) - { - PointPtr center = cell->GetCenter(); - center->setOutlier(false); - onlineCenters.push_back(center->copy()); - } - } - // for (const auto &out : this->outres->getOutliers()) - // { - // PointPtr center = out->GetCenter(); - // center->setOutlier(true); - // onlineCenters.push_back(center->copy()); - // } - DPTreePtr().swap(dpTree); - OutPtr().swap(outres); - CachePtr().swap(cache); - std::unordered_set().swap(clusters); - Init(); - this->alpha = 0; - this->minRho = 0; - this->V10Param.isInit = false; - } - double curTime = input->index; - auto c = retrive(input, this->V10Param.opt, curTime); - if (input->getIndex() % 100 == 0 && this->V10Param.isInit) - { - ds_timer.Tick(); - setMinDelta(adjustMinDelta()); - this->dpTree->adjustCluster(this->clusters); - ds_timer.Tock(); - out_timer.Tick(); - this->delCluster(); - 
out_timer.Tock(); +void SESAME::V10::RunOnline(SESAME::PointPtr input) { + if (input->getIndex() != 0 and input->getIndex() % V10Param.landmark == 0) { + for (const auto &cluster : this->clusters) { + std::unordered_set cells = cluster->GetCells(); + for (const auto &cell : cells) { + PointPtr center = cell->GetCenter(); + center->setOutlier(false); + onlineCenters.push_back(center->copy()); + } } - lat_timer.Add(input->toa); + // for (const auto &out : this->outres->getOutliers()) + // { + // PointPtr center = out->GetCenter(); + // center->setOutlier(true); + // onlineCenters.push_back(center->copy()); + // } + DPTreePtr().swap(dpTree); + OutPtr().swap(outres); + CachePtr().swap(cache); + std::unordered_set().swap(clusters); + Init(); + this->alpha = 0; + this->minRho = 0; + this->V10Param.isInit = false; + } + double curTime = input->index; + auto c = retrive(input, this->V10Param.opt, curTime); + if (input->getIndex() % 100 == 0 && this->V10Param.isInit) { + ds_timer.Tick(); + setMinDelta(adjustMinDelta()); + this->dpTree->adjustCluster(this->clusters); + ds_timer.Tock(); + out_timer.Tick(); + this->delCluster(); + out_timer.Tock(); + } + lat_timer.Add(input->toa); } -void SESAME::V10::RunOffline(SESAME::DataSinkPtr sinkPtr) -{ - on_timer.Add(sum_timer.start); - ref_timer.Tick(); - auto clu = 0; - for (const auto ¢er : this->onlineCenters) - { - center->setClusteringCenter(clu++); - sinkPtr->put(center->copy()); - } - for (const auto &cluster : this->clusters) - { - std::unordered_set cells = cluster->GetCells(); - for (const auto &cell : cells) - { - PointPtr center = cell->GetCenter(); - center->setClusteringCenter(clu++); - center->setOutlier(false); - sinkPtr->put(center->copy()); - } +void SESAME::V10::RunOffline(SESAME::DataSinkPtr sinkPtr) { + on_timer.Add(sum_timer.start); + ref_timer.Tick(); + auto clu = 0; + for (const auto ¢er : this->onlineCenters) { + center->setClusteringCenter(clu++); + sinkPtr->put(center->copy()); + } + for (const auto &cluster 
: this->clusters) { + std::unordered_set cells = cluster->GetCells(); + for (const auto &cell : cells) { + PointPtr center = cell->GetCenter(); + center->setClusteringCenter(clu++); + center->setOutlier(false); + sinkPtr->put(center->copy()); } - // for (const auto &out : this->outres->getOutliers()) - // { - // PointPtr center = out->GetCenter(); - // center->setClusteringCenter(clu++); - // center->setOutlier(true); - // sinkPtr->put(center->copy()); - // } - ref_timer.Tock(); - sum_timer.Tock(); + } + // for (const auto &out : this->outres->getOutliers()) + // { + // PointPtr center = out->GetCenter(); + // center->setClusteringCenter(clu++); + // center->setOutlier(true); + // sinkPtr->put(center->copy()); + // } + ref_timer.Tock(); + sum_timer.Tock(); } diff --git a/src/Algorithm/DesignAspect/V16.cpp b/src/Algorithm/DesignAspect/V16.cpp index 0d651f67..65cbc7c4 100644 --- a/src/Algorithm/DesignAspect/V16.cpp +++ b/src/Algorithm/DesignAspect/V16.cpp @@ -12,214 +12,186 @@ using namespace std; -SESAME::V16::V16(param_t &cmd_params) -{ - this->param = cmd_params; - param.lambda = 1; - gap = (int)(param.cm - param.cl); - dm = param.cm; - dl = param.cl; - minVals = std::vector(param.dim, DBL_MAX); - maxVals = std::vector(param.dim, DBL_MIN); - Coord = std::vector(param.dim, 0); +SESAME::V16::V16(param_t &cmd_params) { + this->param = cmd_params; + param.lambda = 1; + gap = (int)(param.cm - param.cl); + dm = param.cm; + dl = param.cl; + minVals = std::vector(param.dim, DBL_MAX); + maxVals = std::vector(param.dim, DBL_MIN); + Coord = std::vector(param.dim, 0); } SESAME::V16::~V16() = default; -void SESAME::V16::Init() { - sum_timer.Tick(); -} - -void SESAME::V16::OutputOnline(std::vector &output) -{ - int cluID = 0; - for (const auto &point : onlineCenters) - { - point->setClusteringCenter(cluID++); - output.push_back(point); - } - for (auto iter = 0; iter != this->clusterList.size(); iter++) - { - PointPtr point = DataStructureFactory::createPoint(iter, 0, param.dim, 
0); - auto count = 0; - for (auto &iterGrid : this->clusterList.at(iter).grids) - { - for (int iterDim = 0; iterDim < param.dim; iterDim++) - { - if (count == 0) point->setFeatureItem(0, iterDim); - point->setFeatureItem( - point->getFeatureItem(iterDim) + iterGrid.first.coordinates[iterDim], iterDim); - if (count == this->clusterList.at(iter).grids.size() - 1) - { - point->setFeatureItem(point->getFeatureItem(iterDim) / param.dim, iterDim); - } - } - double weight = gridList.find(iterGrid.first)->second.gridDensity; - point->setWeight(point->getWeight() + weight); - count++; +void SESAME::V16::Init() { sum_timer.Tick(); } + +void SESAME::V16::OutputOnline(std::vector &output) { + int cluID = 0; + for (const auto &point : onlineCenters) { + point->setClusteringCenter(cluID++); + output.push_back(point); + } + for (auto iter = 0; iter != this->clusterList.size(); iter++) { + PointPtr point = GenericFactory::New(param.dim, iter); + auto count = 0; + for (auto &iterGrid : this->clusterList.at(iter).grids) { + for (int iterDim = 0; iterDim < param.dim; iterDim++) { + if (count == 0) + point->setFeatureItem(0, iterDim); + point->setFeatureItem(point->getFeatureItem(iterDim) + + iterGrid.first.coordinates[iterDim], + iterDim); + if (count == this->clusterList.at(iter).grids.size() - 1) { + point->setFeatureItem(point->getFeatureItem(iterDim) / param.dim, + iterDim); } - point->setClusteringCenter(cluID++); - output.push_back(point); - } - + } + double weight = gridList.find(iterGrid.first)->second.gridDensity; + point->setWeight(point->getWeight() + weight); + count++; + } + point->setClusteringCenter(cluID++); + output.push_back(point); + } } -void SESAME::V16::calculateGridCoord(PointPtr point) -{ - for (int i = 0; i < param.dim; i++) - { - auto feature = point->getFeatureItem(i); - if (feature > maxVals[i]) - { - maxVals[i] = feature; - } - else if (feature < minVals[i]) - { - minVals[i] = feature; - } - Coord[i] = point->getFeatureItem(i) / param.grid_width; +void 
SESAME::V16::calculateGridCoord(PointPtr point) { + for (int i = 0; i < param.dim; i++) { + auto feature = point->getFeatureItem(i); + if (feature > maxVals[i]) { + maxVals[i] = feature; + } else if (feature < minVals[i]) { + minVals[i] = feature; } + Coord[i] = point->getFeatureItem(i) / param.grid_width; + } } -void SESAME::V16::RunOnline(PointPtr input) -{ - currentTimeStamp = input->index; - ds_timer.Tick(); - calculateGridCoord(input); - GridListUpdate(Coord); // tempCoord - if (!init && currentTimeStamp >= gap) - { - initialClustering(); - init = true; - } - if (currentTimeStamp != 0 and currentTimeStamp % gap == 0) - { - removeSporadic(); - adjustClustering(); - } - if(windowGrid.size() == param.sliding + 1) - { - RemoveWindowPointFromGrid(); - } - ds_timer.Tock(); - lat_timer.Add(input->toa); +void SESAME::V16::RunOnline(PointPtr input) { + currentTimeStamp = input->index; + ds_timer.Tick(); + calculateGridCoord(input); + GridListUpdate(Coord); // tempCoord + if (!init && currentTimeStamp >= gap) { + initialClustering(); + init = true; + } + if (currentTimeStamp != 0 and currentTimeStamp % gap == 0) { + removeSporadic(); + adjustClustering(); + } + if (windowGrid.size() == param.sliding + 1) { + RemoveWindowPointFromGrid(); + } + ds_timer.Tock(); + lat_timer.Add(input->toa); } -void SESAME::V16::RunOffline(DataSinkPtr sinkPtr) -{ - on_timer.Add(sum_timer.start); - ref_timer.Tick(); - int cluID = 0; - for (const auto &point : onlineCenters) - { - point->setClusteringCenter(cluID++); - sinkPtr->put(point); - } - for (auto iter = 0; iter != this->clusterList.size(); iter++) - { - PointPtr point = DataStructureFactory::createPoint(iter, 0, param.dim, 0); - auto count = 0; - for (auto &iterGrid : this->clusterList.at(iter).grids) - { - for (int iterDim = 0; iterDim < param.dim; iterDim++) - { - if (count == 0) point->setFeatureItem(0, iterDim); - point->setFeatureItem( - point->getFeatureItem(iterDim) + iterGrid.first.coordinates[iterDim], iterDim); - if (count 
== this->clusterList.at(iter).grids.size() - 1) - { - point->setFeatureItem(point->getFeatureItem(iterDim) / param.dim, iterDim); - } - } - double weight = gridList.find(iterGrid.first)->second.gridDensity; - point->setWeight(point->getWeight() + weight); - count++; +void SESAME::V16::RunOffline(DataSinkPtr sinkPtr) { + on_timer.Add(sum_timer.start); + ref_timer.Tick(); + int cluID = 0; + for (const auto &point : onlineCenters) { + point->setClusteringCenter(cluID++); + sinkPtr->put(point); + } + for (auto iter = 0; iter != this->clusterList.size(); iter++) { + PointPtr point = GenericFactory::New(param.dim, iter); + auto count = 0; + for (auto &iterGrid : this->clusterList.at(iter).grids) { + for (int iterDim = 0; iterDim < param.dim; iterDim++) { + if (count == 0) + point->setFeatureItem(0, iterDim); + point->setFeatureItem(point->getFeatureItem(iterDim) + + iterGrid.first.coordinates[iterDim], + iterDim); + if (count == this->clusterList.at(iter).grids.size() - 1) { + point->setFeatureItem(point->getFeatureItem(iterDim) / param.dim, + iterDim); } - point->setClusteringCenter(cluID++); - sinkPtr->put(point); - } - ref_timer.Tock(); - sum_timer.Tock(); + } + double weight = gridList.find(iterGrid.first)->second.gridDensity; + point->setWeight(point->getWeight() + weight); + count++; + } + point->setClusteringCenter(cluID++); + sinkPtr->put(point); + } + ref_timer.Tock(); + sum_timer.Tock(); } /* Update the grid list of V16 when data inserting into the grid * */ -void SESAME::V16::GridListUpdate(const std::vector &coordinate) -{ - CharacteristicVector characteristicVec; - DensityGrid grid(coordinate); - // 3. If (g not in grid_list) insert dg to grid_list - auto it = this->gridList.find(grid); - if (it == gridList.end()) - { - characteristicVec = CharacteristicVector(currentTimeStamp, 0, 1.0, -1, false, dl, dm); - this->gridList.insert(std::make_pair(grid, characteristicVec)); - } - // 4. 
Update the characteristic vector of dg - else - { - it->second.densityWithNew(currentTimeStamp, param.lambda); - it->second.updateTime = currentTimeStamp; - } - windowGrid.push_back(grid); +void SESAME::V16::GridListUpdate(const std::vector &coordinate) { + CharacteristicVector characteristicVec; + DensityGrid grid(coordinate); + // 3. If (g not in grid_list) insert dg to grid_list + auto it = this->gridList.find(grid); + if (it == gridList.end()) { + characteristicVec = + CharacteristicVector(currentTimeStamp, 0, 1.0, -1, false, dl, dm); + this->gridList.insert(std::make_pair(grid, characteristicVec)); + } + // 4. Update the characteristic vector of dg + else { + it->second.densityWithNew(currentTimeStamp, param.lambda); + it->second.updateTime = currentTimeStamp; + } + windowGrid.push_back(grid); } /* Update the grid list of V16 when data inserting into the grid * */ -void SESAME::V16::RemoveWindowPointFromGrid() -{ - auto grid = windowGrid[0]; - auto it = this->gridList.find(grid); - if (it != gridList.end()) - { - it->second.gridDensity -= 1.0; - } +void SESAME::V16::RemoveWindowPointFromGrid() { + auto grid = windowGrid[0]; + auto it = this->gridList.find(grid); + if (it != gridList.end()) { + it->second.gridDensity -= 1.0; + } } - /** * Implements the procedure given in Figure 3 of Chen and Tu 2007 */ -void SESAME::V16::initialClustering() -{ - // 1. Update the density of all grids in grid_list - // Timer: online grid - updateGridListDensity(); - // 2. Assign each dense grid to a distinct cluster - // and - // 3. 
Label all other grids as NO_CLASS - auto gridIter = this->gridList.begin(); - HashMap newGridList; - while (gridIter != gridList.end()) - { - DensityGrid grid = gridIter->first; - CharacteristicVector characteristicVecOfG = gridIter->second; - if (characteristicVecOfG.attribute == DENSE) - { - int gridClass = this->clusterList.size(); - characteristicVecOfG.label = gridClass; - GridCluster gridCluster = GridCluster(gridClass); - gridCluster.addGrid(grid); - this->clusterList.push_back(gridCluster); - // // SESAME_INFO(" was dense (class "<clusterList.size()); - } - else - characteristicVecOfG.label = NO_CLASS; - newGridList.insert(std::make_pair(grid, characteristicVecOfG)); - ++gridIter; - } - this->gridList = newGridList; - // 4. Make changes to grid labels by doing: - // a. For each cluster c - // b. For each outside grid g of c - // c. For each neighbouring grid h of g - // d. If h belongs to c', label c and c' with - // the label of the largest cluster - // e. Else if h is transitional, assign it to c - // f. While changes can be made - while (adjustLabels()) - ; // while changes are being made +void SESAME::V16::initialClustering() { + // 1. Update the density of all grids in grid_list + // Timer: online grid + updateGridListDensity(); + // 2. Assign each dense grid to a distinct cluster + // and + // 3. 
Label all other grids as NO_CLASS + auto gridIter = this->gridList.begin(); + HashMap newGridList; + while (gridIter != gridList.end()) { + DensityGrid grid = gridIter->first; + CharacteristicVector characteristicVecOfG = gridIter->second; + if (characteristicVecOfG.attribute == DENSE) { + int gridClass = this->clusterList.size(); + characteristicVecOfG.label = gridClass; + GridCluster gridCluster = GridCluster(gridClass); + gridCluster.addGrid(grid); + this->clusterList.push_back(gridCluster); + // // SESAME_INFO(" was dense (class "<clusterList.size()); + } else + characteristicVecOfG.label = NO_CLASS; + newGridList.insert(std::make_pair(grid, characteristicVecOfG)); + ++gridIter; + } + this->gridList = newGridList; + // 4. Make changes to grid labels by doing: + // a. For each cluster c + // b. For each outside grid g of c + // c. For each neighbouring grid h of g + // d. If h belongs to c', label c and c' with + // the label of the largest cluster + // e. Else if h is transitional, assign it to c + // f. While changes can be made + while (adjustLabels()) + ; // while changes are being made } /** * Makes first change available to it by following the steps: @@ -230,87 +202,77 @@ void SESAME::V16::initialClustering() * Else if h is transitional, assign it to c * @return TRUE if a change was made to any cluster's labels, FALSE otherwise */ -bool SESAME::V16::adjustLabels() -{ - // bool adjust=false; - // a. For each cluster c - for (GridCluster &gridCluster : this->clusterList) - { - // // SESAME_INFO("Adjusting from cluster "<first; - bool inside = gridIter->second; - // // SESAME_INFO(" Inspecting density grid, grid, standby..."); - - // b. for each OUTSIDE grid of cluster - if (!inside) - { - // // SESAME_INFO(" Density grid dg is outside!"); - // c. 
for each neighbouring grid, of current iter grid - for (const DensityGrid &gridNeighbourhood : grid.getNeighbours()) - { - auto it2 = this->gridList.find(gridNeighbourhood); - if (it2 != gridList.end()) - { - auto it1 = this->gridList.find(grid); - CharacteristicVector characteristicVec1 = it1->second; - CharacteristicVector characteristicVec2 = it2->second; - int class1 = characteristicVec1.label; - int class2 = characteristicVec2.label; - // ...and if neighbouring grid isn't already in the same cluster as - // grid... - if (class1 != class2) - { - // If neighbouring grid is in cluster c', merge c and c' into the - // larger of the two - if (class2 != NO_CLASS) - { - if (this->clusterList.at(class1).grids.size() < - this->clusterList.at(class2).grids.size()) - mergeClusters(class1, class2); - else - mergeClusters(class2, class1); - return true; - } - // If gridNeighbourhood is transitional and 'outside' of the - // cluster, assign it to cluster - else if (characteristicVec2.isTransitional(dm, dl)) - { - characteristicVec2.label = class1; - gridCluster.addGrid(gridNeighbourhood); - this->clusterList.at(class1) = gridCluster; - if (it1 != gridList.end()) - it1->second = characteristicVec2; - else - this->gridList.insert( - std::make_pair(grid, characteristicVec2)); - return true; - } - } - } - } +bool SESAME::V16::adjustLabels() { + // bool adjust=false; + // a. For each cluster c + for (GridCluster &gridCluster : this->clusterList) { + // // SESAME_INFO("Adjusting from cluster "<first; + bool inside = gridIter->second; + // // SESAME_INFO(" Inspecting density grid, grid, standby..."); + + // b. for each OUTSIDE grid of cluster + if (!inside) { + // // SESAME_INFO(" Density grid dg is outside!"); + // c. 
for each neighbouring grid, of current iter grid + for (const DensityGrid &gridNeighbourhood : grid.getNeighbours()) { + auto it2 = this->gridList.find(gridNeighbourhood); + if (it2 != gridList.end()) { + auto it1 = this->gridList.find(grid); + CharacteristicVector characteristicVec1 = it1->second; + CharacteristicVector characteristicVec2 = it2->second; + int class1 = characteristicVec1.label; + int class2 = characteristicVec2.label; + // ...and if neighbouring grid isn't already in the same cluster + // as grid... + if (class1 != class2) { + // If neighbouring grid is in cluster c', merge c and c' into + // the larger of the two + if (class2 != NO_CLASS) { + if (this->clusterList.at(class1).grids.size() < + this->clusterList.at(class2).grids.size()) + mergeClusters(class1, class2); + else + mergeClusters(class2, class1); + return true; } + // If gridNeighbourhood is transitional and 'outside' of the + // cluster, assign it to cluster + else if (characteristicVec2.isTransitional(dm, dl)) { + characteristicVec2.label = class1; + gridCluster.addGrid(gridNeighbourhood); + this->clusterList.at(class1) = gridCluster; + if (it1 != gridList.end()) + it1->second = characteristicVec2; + else + this->gridList.insert( + std::make_pair(grid, characteristicVec2)); + return true; + } + } } + } } + } } - return false; + } + return false; } /** - * Iterates through grid_list and updates the density for each density grid therein. - * Also marks each density grid as unvisited for this call to adjustClustering. + * Iterates through grid_list and updates the density for each density grid + * therein. Also marks each density grid as unvisited for this call to + * adjustClustering. 
*/ -void SESAME::V16::updateGridListDensity() -{ - for (auto &iter : this->gridList) - { - iter.second.isVisited = false; - iter.second.UpdateAllDensity(currentTimeStamp, param.lambda, dl, dm); - } +void SESAME::V16::updateGridListDensity() { + for (auto &iter : this->gridList) { + iter.second.isVisited = false; + iter.second.UpdateAllDensity(currentTimeStamp, param.lambda, dl, dm); + } } /** @@ -319,66 +281,65 @@ void SESAME::V16::updateGridListDensity() * * @see moa.clusterers.V16.V16#gap */ -void SESAME::V16::adjustClustering() -{ - // SESAME_INFO("ADJUST CLUSTERING CALLED "); - // 1. Update the density of all grids in grid_list - updateGridListDensity(); - // 2. For each grid dg whose attribute is changed since last call - // a. If dg is sparse - // b. If dg is dense - // c. If dg is transitional - while (inspectChangedGrids()) - ; +void SESAME::V16::adjustClustering() { + // SESAME_INFO("ADJUST CLUSTERING CALLED "); + // 1. Update the density of all grids in grid_list + updateGridListDensity(); + // 2. For each grid dg whose attribute is changed since last call + // a. If dg is sparse + // b. If dg is dense + // c. If dg is transitional + while (inspectChangedGrids()) + ; } /** - * Inspects each density grid in grid_list whose attribute has changed since the last - * call to adjustClustering. Implements lines 3/4/7/19 of the procedure given in Figure - * 4 of Chen and Tu 2007. + * Inspects each density grid in grid_list whose attribute has changed since the + * last call to adjustClustering. Implements lines 3/4/7/19 of the procedure + * given in Figure 4 of Chen and Tu 2007. * * @return TRUE if any grids are updated; FALSE otherwise. 
*/ -bool SESAME::V16::inspectChangedGrids() -{ - HashMap newGridList; - auto gridIter = this->gridList.begin(); - int a = 0; - while (gridIter != gridList.end()) //&& newGridList.empty() - { - const DensityGrid &grid = gridIter->first; - const CharacteristicVector &characteristicVec = gridIter->second; - int gridClass = characteristicVec.label; - if (characteristicVec.attChange && !characteristicVec.isVisited) // grid.isVisited - { // grid.isVisited=true; - gridIter->second.isVisited = true; - newGridList.insert(std::make_pair(grid, characteristicVec)); - if (characteristicVec.attribute == SPARSE) - mergeGridList(newGridList, adjustForSparseGrid(grid, characteristicVec, gridClass)); - else if (characteristicVec.attribute == DENSE) - mergeGridList(newGridList, adjustForDenseGrid(grid, characteristicVec, gridClass)); - else // TRANSITIONAL - mergeGridList(newGridList, - adjustForTransitionalGrid(grid, characteristicVec, gridClass)); - } - ++gridIter; - a++; - } - // If there are grids in new grid list, update the corresponding grids in grid_list and clean up - // the cluster list - if (!newGridList.empty()) - { - mergeGridList(this->gridList, newGridList); - cleanClusters(); - return true; - } - else - return false; +bool SESAME::V16::inspectChangedGrids() { + HashMap newGridList; + auto gridIter = this->gridList.begin(); + int a = 0; + while (gridIter != gridList.end()) //&& newGridList.empty() + { + const DensityGrid &grid = gridIter->first; + const CharacteristicVector &characteristicVec = gridIter->second; + int gridClass = characteristicVec.label; + if (characteristicVec.attChange && + !characteristicVec.isVisited) // grid.isVisited + { // grid.isVisited=true; + gridIter->second.isVisited = true; + newGridList.insert(std::make_pair(grid, characteristicVec)); + if (characteristicVec.attribute == SPARSE) + mergeGridList(newGridList, + adjustForSparseGrid(grid, characteristicVec, gridClass)); + else if (characteristicVec.attribute == DENSE) + 
mergeGridList(newGridList, + adjustForDenseGrid(grid, characteristicVec, gridClass)); + else // TRANSITIONAL + mergeGridList(newGridList, adjustForTransitionalGrid( + grid, characteristicVec, gridClass)); + } + ++gridIter; + a++; + } + // If there are grids in new grid list, update the corresponding grids in + // grid_list and clean up the cluster list + if (!newGridList.empty()) { + mergeGridList(this->gridList, newGridList); + cleanClusters(); + return true; + } else + return false; } /** - * Adjusts the clustering of a sparse density grid. Implements lines 5 and 6 from Figure 4 of Chen - * and Tu 2007. + * Adjusts the clustering of a sparse density grid. Implements lines 5 and 6 + * from Figure 4 of Chen and Tu 2007. * * @param dg the sparse density grid being adjusted * @param cv the characteristic vector of dg @@ -386,153 +347,139 @@ bool SESAME::V16::inspectChangedGrids() * * @return a HashMap containing density grids for update after this iteration */ -SESAME::HashMap SESAME::V16::adjustForSparseGrid(const DensityGrid &grid, - CharacteristicVector characteristicVec, - int gridClass) -{ - HashMap newGridList; - if (gridClass != NO_CLASS) - { - //// SESAME_INFO("It is removed from cluster "< containing density grids for update after - * this iteration + * @return a HashMap containing density grids + * for update after this iteration */ -SESAME::HashMap SESAME::V16::reCluster(GridCluster &gridCluster) -{ - // SESAME_INFO("Now re-cluster!"); - HashMap newGridList; - auto gcIter = gridCluster.grids.begin(); - // // SESAME_INFO("ReCluster called for cluster "<first; - CharacteristicVector characteristicVecOfGrid = this->gridList.find(grid)->second; - if (characteristicVecOfGrid.attribute == DENSE) - { - int gridClass = (int)newClusterList.size(); - characteristicVecOfGrid.label = gridClass; - GridCluster newCluster(gridClass); - newCluster.addGrid(grid); - newClusterList.push_back(newCluster); - } - else - characteristicVecOfGrid.label = NO_CLASS; - 
newGridList.insert(std::make_pair(grid, characteristicVecOfGrid)); - gcIter++; - } - - bool changesMade; - // While changes can be made... - do - { - changesMade = false; - HashMap gridListAdjusted = adjustNewLabels(newGridList); - if (!gridListAdjusted.empty()) - { - // SESAME_INFO("grid list is adjusted for sparse!"); - mergeGridList(newGridList, gridListAdjusted); - changesMade = true; - } - } while (changesMade); - - // Update the cluster list with the newly formed clusters - gridCluster.grids.clear(); - this->clusterList.at(gridCluster.clusterLabel) = gridCluster; - for (GridCluster &cluster : newClusterList) this->clusterList.push_back(cluster); - return newGridList; +SESAME::HashMap SESAME::V16::reCluster(GridCluster &gridCluster) { + // SESAME_INFO("Now re-cluster!"); + HashMap newGridList; + auto gcIter = gridCluster.grids.begin(); + // // SESAME_INFO("ReCluster called for cluster "<first; + CharacteristicVector characteristicVecOfGrid = + this->gridList.find(grid)->second; + if (characteristicVecOfGrid.attribute == DENSE) { + int gridClass = (int)newClusterList.size(); + characteristicVecOfGrid.label = gridClass; + GridCluster newCluster(gridClass); + newCluster.addGrid(grid); + newClusterList.push_back(newCluster); + } else + characteristicVecOfGrid.label = NO_CLASS; + newGridList.insert(std::make_pair(grid, characteristicVecOfGrid)); + gcIter++; + } + + bool changesMade; + // While changes can be made... 
+ do { + changesMade = false; + HashMap gridListAdjusted = adjustNewLabels(newGridList); + if (!gridListAdjusted.empty()) { + // SESAME_INFO("grid list is adjusted for sparse!"); + mergeGridList(newGridList, gridListAdjusted); + changesMade = true; + } + } while (changesMade); + + // Update the cluster list with the newly formed clusters + gridCluster.grids.clear(); + this->clusterList.at(gridCluster.clusterLabel) = gridCluster; + for (GridCluster &cluster : newClusterList) + this->clusterList.push_back(cluster); + return newGridList; } -SESAME::HashMap SESAME::V16::adjustNewLabels(const SESAME::HashMap &newGridList) -{ - HashMap gridListAdjusted; - // a. For each cluster c - for (GridCluster &gridCluster : newClusterList) - { - for (auto &gridIter : gridCluster.grids) - { - DensityGrid grid = gridIter.first; - bool inside = gridIter.second; - - // b. for each OUTSIDE grid, dg, of c - if (!inside) - { - // c. for each neighbouring grid, neighbourGrid, of dg - - for (DensityGrid &neighbourGrid : grid.getNeighbours()) - { - if (newGridList.find(neighbourGrid) != newGridList.end()) - { - CharacteristicVector characteristicVec1 = - newGridList.find(neighbourGrid)->second; - CharacteristicVector characteristicVec2 = - newGridList.find(neighbourGrid)->second; - int class1 = characteristicVec1.label; - int class2 = characteristicVec2.label; - - // ...and if neighbourGrid isn't already in the same cluster as dg... 
- if (class1 != class2) - { - GridCluster cluster1 = newClusterList.at(class1); - // If dgprime is in cluster c', merge c and c' into the larger of the - // two - if (class2 != NO_CLASS) - { - GridCluster cluster2 = newClusterList.at(class2); - // System.out.println("C is "+class1+" and C' is "+class2+"."); - if (cluster1.grids.size() < cluster2.grids.size()) - mergeGridList(gridListAdjusted, - mergeNewClusters(newGridList, class1, class2)); - else - mergeGridList(gridListAdjusted, - mergeNewClusters(newGridList, class2, class1)); - - return gridListAdjusted; - } - // If neighbourGrid is transitional and outside of cluster, assign it to - // cluster - else if (characteristicVec2.isTransitional(dm, dl)) - { - characteristicVec2.label = class1; - cluster1.addGrid(neighbourGrid); - this->newClusterList.at(class1) = cluster1; - gridListAdjusted.insert( - std::make_pair(neighbourGrid, characteristicVec2)); - return gridListAdjusted; - } - } - } - } +SESAME::HashMap +SESAME::V16::adjustNewLabels(const SESAME::HashMap &newGridList) { + HashMap gridListAdjusted; + // a. For each cluster c + for (GridCluster &gridCluster : newClusterList) { + for (auto &gridIter : gridCluster.grids) { + DensityGrid grid = gridIter.first; + bool inside = gridIter.second; + + // b. for each OUTSIDE grid, dg, of c + if (!inside) { + // c. for each neighbouring grid, neighbourGrid, of dg + + for (DensityGrid &neighbourGrid : grid.getNeighbours()) { + if (newGridList.find(neighbourGrid) != newGridList.end()) { + CharacteristicVector characteristicVec1 = + newGridList.find(neighbourGrid)->second; + CharacteristicVector characteristicVec2 = + newGridList.find(neighbourGrid)->second; + int class1 = characteristicVec1.label; + int class2 = characteristicVec2.label; + + // ...and if neighbourGrid isn't already in the same cluster as + // dg... 
+ if (class1 != class2) { + GridCluster cluster1 = newClusterList.at(class1); + // If dgprime is in cluster c', merge c and c' into the larger of + // the two + if (class2 != NO_CLASS) { + GridCluster cluster2 = newClusterList.at(class2); + // System.out.println("C is "+class1+" and C' is "+class2+"."); + if (cluster1.grids.size() < cluster2.grids.size()) + mergeGridList(gridListAdjusted, + mergeNewClusters(newGridList, class1, class2)); + else + mergeGridList(gridListAdjusted, + mergeNewClusters(newGridList, class2, class1)); + + return gridListAdjusted; + } + // If neighbourGrid is transitional and outside of cluster, assign + // it to cluster + else if (characteristicVec2.isTransitional(dm, dl)) { + characteristicVec2.label = class1; + cluster1.addGrid(neighbourGrid); + this->newClusterList.at(class1) = cluster1; + gridListAdjusted.insert( + std::make_pair(neighbourGrid, characteristicVec2)); + return gridListAdjusted; + } } + } } + } } - return gridListAdjusted; + } + return gridListAdjusted; } /** - * Adjusts the clustering of a dense density grid. Implements lines 8 through 18 from Figure 4 of - * Chen and Tu 2007. + * Adjusts the clustering of a dense density grid. Implements lines 8 through 18 + * from Figure 4 of Chen and Tu 2007. 
* * @param grid the dense density grid being adjusted * @param characteristicVec the characteristic vector of dg @@ -540,241 +487,223 @@ SESAME::HashMap SESAME::V16::adjustNewLabels(const SESAME::HashMap &newGridList) * * @return a HashMapcontaining density grids for update after this iteration */ -SESAME::HashMap SESAME::V16::adjustForDenseGrid(const DensityGrid &grid, - CharacteristicVector characteristicVec, - int gridClass) -{ - // Among all neighbours of dg, find the grid h whose cluster ch has the largest size - GridCluster gridCluster; // The cluster, ch, of h - DensityGrid gridChosen(grid); // The chosen grid h, whose cluster ch has the largest size - double ChosenGridSize = -1.0; // The size of gridCluster, the largest cluster - int hClass = NO_CLASS; // The class label of h - int hChosenClass = NO_CLASS; // The class label of ch - - HashMap newGridList; - //// SESAME_INFO("adjust For Dense Grid "<gridList.find(neighbourGrid) != gridList.end()) - { - hClass = this->gridList.find(neighbourGrid)->second.label; - if (hClass != NO_CLASS) - { - for (auto gridCluster : clusterList) - { - if (gridCluster.clusterLabel == hClass) - { - if (gridCluster.grids.size() > ChosenGridSize) - { - ChosenGridSize = gridCluster.grids.size(); - hChosenClass = hClass; - gridChosen = DensityGrid(neighbourGrid); - } - } - } +SESAME::HashMap +SESAME::V16::adjustForDenseGrid(const DensityGrid &grid, + CharacteristicVector characteristicVec, + int gridClass) { + // Among all neighbours of dg, find the grid h whose cluster ch has the + // largest size + GridCluster gridCluster; // The cluster, ch, of h + DensityGrid gridChosen( + grid); // The chosen grid h, whose cluster ch has the largest size + double ChosenGridSize = -1.0; // The size of gridCluster, the largest cluster + int hClass = NO_CLASS; // The class label of h + int hChosenClass = NO_CLASS; // The class label of ch + + HashMap newGridList; + //// SESAME_INFO("adjust For Dense Grid "<gridList.find(neighbourGrid) != 
gridList.end()) { + hClass = this->gridList.find(neighbourGrid)->second.label; + if (hClass != NO_CLASS) { + for (auto gridCluster : clusterList) { + if (gridCluster.clusterLabel == hClass) { + if (gridCluster.grids.size() > ChosenGridSize) { + ChosenGridSize = gridCluster.grids.size(); + hChosenClass = hClass; + gridChosen = DensityGrid(neighbourGrid); } + } } + } } + } - if (hChosenClass != NO_CLASS && hChosenClass != gridClass) - { - gridCluster = this->clusterList.at(hChosenClass); - - // If h is a dense grid - if (this->gridList.find(gridChosen)->second.attribute == DENSE) - { - // // SESAME_INFO("h is dense."); - // If dg is labelled as NO_CLASS - if (gridClass == NO_CLASS) - { - // // SESAME_INFO("g was labelled NO_CLASS"); - characteristicVec.label = hChosenClass; - newGridList.insert(std::make_pair(grid, characteristicVec)); - gridCluster.addGrid(grid); - this->clusterList.at(hChosenClass) = gridCluster; - } - // Else if dg belongs to cluster c and h belongs to c' - else - { - // // SESAME_INFO("g was labelled "<clusterList.at(gridClass).grids.size(); - - if (gSize <= ChosenGridSize) - mergeClusters(gridClass, hChosenClass); - else - mergeClusters(hChosenClass, gridClass); - } - } + if (hChosenClass != NO_CLASS && hChosenClass != gridClass) { + gridCluster = this->clusterList.at(hChosenClass); - // Else if h is a transitional grid - else if (this->gridList.at(gridChosen).attribute == TRANSITIONAL) - { - // // SESAME_INFO("h is transitional."); - // If dg is labelled as no class and if h is an outside grid if dg is added to ch - if (gridClass == NO_CLASS && !gridCluster.isInside(gridChosen, grid)) - { - characteristicVec.label = hChosenClass; - newGridList.insert(std::make_pair(grid, characteristicVec)); - gridCluster.addGrid(grid); - this->clusterList.at(hChosenClass) = gridCluster; - // // SESAME_INFO(" dg is added to cluster "<= |ch| - else if (gridClass != NO_CLASS) - { - GridCluster c = this->clusterList.at(gridClass); - double gSize = c.grids.size(); 
- - if (gSize >= ChosenGridSize) - { - // Move h from cluster ch to cluster c - gridCluster.removeGrid(gridChosen); - c.addGrid(gridChosen); - CharacteristicVector cvhChosen = this->gridList.find(gridChosen)->second; - cvhChosen.label = gridClass; - newGridList.insert(std::make_pair(gridChosen, cvhChosen)); - // // SESAME_INFO("dgClass is "<clusterList.at(hChosenClass) = gridCluster; - this->clusterList.at(gridClass) = c; - } - } - } - } - // If dgClass is dense and not in a cluster, and none if its neighbours are in a cluster, - // put it in its own new cluster and search the neighbourhood for transitional or dense - // grids to add - else if (gridClass == NO_CLASS) - { - int newClass = (int)this->clusterList.size(); - GridCluster c = GridCluster(newClass); - c.addGrid(grid); - // System.out.println("Added "+dg.toString()+" to cluster "+newClass+"."); - this->clusterList.push_back(c); - characteristicVec.label = newClass; - if (newGridList.find(grid) != newGridList.end()) - newGridList.find(grid)->second = characteristicVec; + // If h is a dense grid + if (this->gridList.find(gridChosen)->second.attribute == DENSE) { + // // SESAME_INFO("h is dense."); + // If dg is labelled as NO_CLASS + if (gridClass == NO_CLASS) { + // // SESAME_INFO("g was labelled NO_CLASS"); + characteristicVec.label = hChosenClass; + newGridList.insert(std::make_pair(grid, characteristicVec)); + gridCluster.addGrid(grid); + this->clusterList.at(hChosenClass) = gridCluster; + } + // Else if dg belongs to cluster c and h belongs to c' + else { + // // SESAME_INFO("g was labelled "<clusterList.at(gridClass).grids.size(); + + if (gSize <= ChosenGridSize) + mergeClusters(gridClass, hChosenClass); else - newGridList.insert(std::make_pair(grid, characteristicVec)); - // Iterate through the neighbourhood until no more transitional neighbours can be added - // (dense neighbours will add themselves as part of their adjust process) - for (DensityGrid &dghprime : grid.getNeighbours()) - { - if 
(this->gridList.find(dghprime) != this->gridList.end() && - c.grids.find(dghprime) != c.grids.end()) - { - CharacteristicVector cvhprime = this->gridList.find(dghprime)->second; - if (cvhprime.attribute == TRANSITIONAL) - { - c.addGrid(dghprime); - cvhprime.label = newClass; - newGridList.insert(std::make_pair(dghprime, cvhprime)); - } - } + mergeClusters(hChosenClass, gridClass); + } + } + + // Else if h is a transitional grid + else if (this->gridList.at(gridChosen).attribute == TRANSITIONAL) { + // // SESAME_INFO("h is transitional."); + // If dg is labelled as no class and if h is an outside grid if dg is + // added to ch + if (gridClass == NO_CLASS && !gridCluster.isInside(gridChosen, grid)) { + characteristicVec.label = hChosenClass; + newGridList.insert(std::make_pair(grid, characteristicVec)); + gridCluster.addGrid(grid); + this->clusterList.at(hChosenClass) = gridCluster; + // // SESAME_INFO(" dg is added to cluster "<= |ch| + else if (gridClass != NO_CLASS) { + GridCluster c = this->clusterList.at(gridClass); + double gSize = c.grids.size(); + + if (gSize >= ChosenGridSize) { + // Move h from cluster ch to cluster c + gridCluster.removeGrid(gridChosen); + c.addGrid(gridChosen); + CharacteristicVector cvhChosen = + this->gridList.find(gridChosen)->second; + cvhChosen.label = gridClass; + newGridList.insert(std::make_pair(gridChosen, cvhChosen)); + // // SESAME_INFO("dgClass is "<clusterList.at(hChosenClass) = gridCluster; + this->clusterList.at(gridClass) = c; + } + } + } + } + // If dgClass is dense and not in a cluster, and none if its neighbours are in + // a cluster, put it in its own new cluster and search the neighbourhood for + // transitional or dense grids to add + else if (gridClass == NO_CLASS) { + int newClass = (int)this->clusterList.size(); + GridCluster c = GridCluster(newClass); + c.addGrid(grid); + // System.out.println("Added "+dg.toString()+" to cluster "+newClass+"."); + this->clusterList.push_back(c); + characteristicVec.label = 
newClass; + if (newGridList.find(grid) != newGridList.end()) + newGridList.find(grid)->second = characteristicVec; + else + newGridList.insert(std::make_pair(grid, characteristicVec)); + // Iterate through the neighbourhood until no more transitional neighbours + // can be added (dense neighbours will add themselves as part of their + // adjust process) + for (DensityGrid &dghprime : grid.getNeighbours()) { + if (this->gridList.find(dghprime) != this->gridList.end() && + c.grids.find(dghprime) != c.grids.end()) { + CharacteristicVector cvhprime = this->gridList.find(dghprime)->second; + if (cvhprime.attribute == TRANSITIONAL) { + c.addGrid(dghprime); + cvhprime.label = newClass; + newGridList.insert(std::make_pair(dghprime, cvhprime)); } - this->clusterList.at(newClass) = c; + } } + this->clusterList.at(newClass) = c; + } - return newGridList; + return newGridList; } /** - * Adjusts the clustering of a transitional density grid. Implements lines 20 and 21 from Figure 4 - * of Chen and Tu 2007. + * Adjusts the clustering of a transitional density grid. Implements lines 20 + * and 21 from Figure 4 of Chen and Tu 2007. 
* * @param dg the dense density grid being adjusted * @param cv the characteristic vector of dg * @param dgClass the cluster to which dg belonged * - * @return a HashMap containing density grids for update after - * this iteration + * @return a HashMap containing density grids + * for update after this iteration */ -SESAME::HashMap SESAME::V16::adjustForTransitionalGrid(const DensityGrid &grid, - CharacteristicVector characteristicVec, - int gridClass) -{ - // Among all neighbours of dg, find the grid h whose cluster ch has the largest size - // and satisfies that dg would be an outside grid if added to it - GridCluster gridCluster; // The cluster, ch, of h - double hChosenSize = 0.0; // The size of ch, the largest cluster - DensityGrid neighbourGrid; // The neighbour of dg being considered - int hClass = NO_CLASS; // The class label of h - int hChosenClass = NO_CLASS; // The class label of ch - HashMap newGridList; - //// SESAME_INFO("adjust For Transitional Grid "<gridList.find(neighbourGrid); - if (it != gridList.end()) - { - hClass = it->second.label; - ; - if (hClass != NO_CLASS) - { - for (auto gridCluster : clusterList) - { - if (gridCluster.clusterLabel == hClass) - { - if ((gridCluster.grids.size() > hChosenSize) && - !gridCluster.isInside(grid, grid)) - { - hChosenSize = gridCluster.grids.size(); - hChosenClass = hClass; - } - } - } +SESAME::HashMap +SESAME::V16::adjustForTransitionalGrid(const DensityGrid &grid, + CharacteristicVector characteristicVec, + int gridClass) { + // Among all neighbours of dg, find the grid h whose cluster ch has the + // largest size and satisfies that dg would be an outside grid if added to it + GridCluster gridCluster; // The cluster, ch, of h + double hChosenSize = 0.0; // The size of ch, the largest cluster + DensityGrid neighbourGrid; // The neighbour of dg being considered + int hClass = NO_CLASS; // The class label of h + int hChosenClass = NO_CLASS; // The class label of ch + HashMap newGridList; + //// 
SESAME_INFO("adjust For Transitional Grid "<gridList.find(neighbourGrid); + if (it != gridList.end()) { + hClass = it->second.label; + ; + if (hClass != NO_CLASS) { + for (auto gridCluster : clusterList) { + if (gridCluster.clusterLabel == hClass) { + if ((gridCluster.grids.size() > hChosenSize) && + !gridCluster.isInside(grid, grid)) { + hChosenSize = gridCluster.grids.size(); + hChosenClass = hClass; } + } } + } } + } - if (hChosenClass != NO_CLASS && hChosenClass != gridClass) - { - for (auto gridCluster : clusterList) - { - if (gridCluster.clusterLabel == hChosenClass) gridCluster.addGrid(grid); - if (gridCluster.clusterLabel == gridClass and gridClass != NO_CLASS) - gridCluster.removeGrid(grid); - } - gridCluster = this->clusterList.at(hChosenClass); + if (hChosenClass != NO_CLASS && hChosenClass != gridClass) { + for (auto gridCluster : clusterList) { + if (gridCluster.clusterLabel == hChosenClass) gridCluster.addGrid(grid); - this->clusterList.at(hChosenClass) = gridCluster; - - characteristicVec.label = hChosenClass; - newGridList.insert(std::make_pair(grid, characteristicVec)); + if (gridCluster.clusterLabel == gridClass and gridClass != NO_CLASS) + gridCluster.removeGrid(grid); } + gridCluster = this->clusterList.at(hChosenClass); + gridCluster.addGrid(grid); + this->clusterList.at(hChosenClass) = gridCluster; - return newGridList; -} - -SESAME::HashMap SESAME::V16::mergeNewClusters(SESAME::HashMap newGridList, int smallCluster, - int bigCluster) -{ - // System.out.println("Merge new clusters "+smallCluster+" and "+bigCluster+"."); - // Iterate through the density grids in grid_list to find those which are in highClass - for (HashMap::iterator gridIter = newGridList.begin(); gridIter != gridList.end(); gridIter++) - { - DensityGrid grid = gridIter->first; - CharacteristicVector characteristicVec = gridIter->second; - - // Assign density grids in small Cluster to bigCluster - if (characteristicVec.label == smallCluster) - { - characteristicVec.label = 
bigCluster; - newGridList.insert(std::make_pair(grid, characteristicVec)); - } - } - // SESAME_INFO("Density grids assigned to cluster "<newClusterList.at(bigCluster); - bGC.absorbCluster(this->newClusterList.at(smallCluster)); - this->newClusterList.at(bigCluster) = bGC; - this->newClusterList.erase(this->newClusterList.begin() + smallCluster); - // System.out.println("Cluster "+smallClus+" removed from list."); - newGridList = cleanNewClusters(newGridList); + return newGridList; +} - return newGridList; +SESAME::HashMap SESAME::V16::mergeNewClusters(SESAME::HashMap newGridList, + int smallCluster, + int bigCluster) { + // System.out.println("Merge new clusters "+smallCluster+" and + // "+bigCluster+"."); Iterate through the density grids in grid_list to find + // those which are in highClass + for (HashMap::iterator gridIter = newGridList.begin(); + gridIter != gridList.end(); gridIter++) { + DensityGrid grid = gridIter->first; + CharacteristicVector characteristicVec = gridIter->second; + + // Assign density grids in small Cluster to bigCluster + if (characteristicVec.label == smallCluster) { + characteristicVec.label = bigCluster; + newGridList.insert(std::make_pair(grid, characteristicVec)); + } + } + // SESAME_INFO("Density grids assigned to cluster "<newClusterList.at(bigCluster); + bGC.absorbCluster(this->newClusterList.at(smallCluster)); + this->newClusterList.at(bigCluster) = bGC; + this->newClusterList.erase(this->newClusterList.begin() + smallCluster); + // System.out.println("Cluster "+smallClus+" removed from list."); + newGridList = cleanNewClusters(newGridList); + + return newGridList; } /** @@ -784,133 +713,122 @@ SESAME::HashMap SESAME::V16::mergeNewClusters(SESAME::HashMap newGridList, int s * @param smallCluster - the index of the smaller cluster * @param bigCluster - the index of the bigger cluster */ -void SESAME::V16::mergeClusters(int smallCluster, int bigCluster) -{ - // SESAME_INFO("Merge clusters "<second.label == smallCluster) - { - 
gridIter->second.label = bigCluster; - } - } - // SESAME_INFO("Density grids assigned to cluster "<clusterList.at(bigCluster); - bigGridCluster.absorbCluster(this->clusterList.at(smallCluster)); - this->clusterList.at(bigCluster) = bigGridCluster; - this->clusterList.erase(clusterList.begin() + smallCluster); - // SESAME_INFO("Cluster "<second.label == smallCluster) { + gridIter->second.label = bigCluster; + } + } + // SESAME_INFO("Density grids assigned to cluster "<clusterList.at(bigCluster); + bigGridCluster.absorbCluster(this->clusterList.at(smallCluster)); + this->clusterList.at(bigCluster) = bigGridCluster; + this->clusterList.erase(clusterList.begin() + smallCluster); + // SESAME_INFO("Cluster "< toRemove; - // Check to see if there are any empty clusters - for (auto &cluster : this->newClusterList) - { - if (cluster.grids.empty()) toRemove.push_back(cluster); - } - - // Remove empty clusters - if (!toRemove.empty()) - { - for (auto &RemoveCluster : toRemove) - { - auto removeCIter = - std::find(newClusterList.begin(), newClusterList.end(), RemoveCluster); - if (std::find(newClusterList.begin(), newClusterList.end(), RemoveCluster) != - newClusterList.end()) - this->newClusterList.erase(removeCIter); - } - } - for (auto &cluster : this->newClusterList) - { - auto clusterIter = std::find(newClusterList.begin(), newClusterList.end(), cluster); - int index = (int)std::distance(newClusterList.begin(), clusterIter); - cluster.clusterLabel = index; - unordered_map removeGrids; - for (auto &gridOfCluster : cluster.grids) - { - DensityGrid grid = gridOfCluster.first; - ; - if (newGridList.find(grid) != newGridList.end()) - newGridList.find(grid)->second.label = index; - else - removeGrids.insert(gridOfCluster); - } - for (auto &grid : removeGrids) - { - if (cluster.grids.find(grid.first) != cluster.grids.end()) - cluster.grids.erase(grid.first); - } - } - // SESAME_INFO("Clean finish!"); - return newGridList; +SESAME::HashMap 
SESAME::V16::cleanNewClusters(SESAME::HashMap newGridList) { + std::vector toRemove; + // Check to see if there are any empty clusters + for (auto &cluster : this->newClusterList) { + if (cluster.grids.empty()) + toRemove.push_back(cluster); + } + + // Remove empty clusters + if (!toRemove.empty()) { + for (auto &RemoveCluster : toRemove) { + auto removeCIter = std::find(newClusterList.begin(), newClusterList.end(), + RemoveCluster); + if (std::find(newClusterList.begin(), newClusterList.end(), + RemoveCluster) != newClusterList.end()) + this->newClusterList.erase(removeCIter); + } + } + for (auto &cluster : this->newClusterList) { + auto clusterIter = + std::find(newClusterList.begin(), newClusterList.end(), cluster); + int index = (int)std::distance(newClusterList.begin(), clusterIter); + cluster.clusterLabel = index; + unordered_map removeGrids; + for (auto &gridOfCluster : cluster.grids) { + DensityGrid grid = gridOfCluster.first; + ; + if (newGridList.find(grid) != newGridList.end()) + newGridList.find(grid)->second.label = index; + else + removeGrids.insert(gridOfCluster); + } + for (auto &grid : removeGrids) { + if (cluster.grids.find(grid.first) != cluster.grids.end()) + cluster.grids.erase(grid.first); + } + } + // SESAME_INFO("Clean finish!"); + return newGridList; } // TODO stop right here and confused about get() /** - * Iterates through cluster_list to ensure that all empty clusters have been removed and - * that all cluster IDs match the cluster's index in cluster_list. + * Iterates through cluster_list to ensure that all empty clusters have been + * removed and that all cluster IDs match the cluster's index in cluster_list. 
*/ -void SESAME::V16::cleanClusters() -{ - //// SESAME_INFO("Clean Clusters"); - - std::vector toRemove; - - // Check to see if there are any empty clusters - for (auto &cluster : this->clusterList) - { - if (cluster.grids.empty()) toRemove.push_back(cluster); - } - // Remove empty clusters - if (!toRemove.empty()) - { - for (auto &RemoveCluster : toRemove) - { - auto removeCIter = std::find(clusterList.begin(), clusterList.end(), RemoveCluster); - if (std::find(clusterList.begin(), clusterList.end(), RemoveCluster) != - clusterList.end()) - this->clusterList.erase(removeCIter); - } - } - // Adjust remaining clusters as necessary, index = label = order - for (auto &cluster : this->clusterList) - { - auto clusterIter = std::find(clusterList.begin(), clusterList.end(), cluster); - int index = (int)std::distance(clusterList.begin(), clusterIter); - cluster.clusterLabel = index; - unordered_map removeGrids; - for (auto &gridOfCluster : cluster.grids) - { - DensityGrid grid = gridOfCluster.first; - if (gridList.find(grid) != gridList.end()) - gridList.find(grid)->second.label = index; - else - removeGrids.insert(gridOfCluster); - } - for (auto &grid : removeGrids) - { - if (cluster.grids.find(grid.first) != cluster.grids.end()) - cluster.grids.erase(grid.first); - } - this->clusterList.at(index) = cluster; - } +void SESAME::V16::cleanClusters() { + //// SESAME_INFO("Clean Clusters"); + + std::vector toRemove; + + // Check to see if there are any empty clusters + for (auto &cluster : this->clusterList) { + if (cluster.grids.empty()) + toRemove.push_back(cluster); + } + // Remove empty clusters + if (!toRemove.empty()) { + for (auto &RemoveCluster : toRemove) { + auto removeCIter = + std::find(clusterList.begin(), clusterList.end(), RemoveCluster); + if (std::find(clusterList.begin(), clusterList.end(), RemoveCluster) != + clusterList.end()) + this->clusterList.erase(removeCIter); + } + } + // Adjust remaining clusters as necessary, index = label = order + for (auto 
&cluster : this->clusterList) { + auto clusterIter = + std::find(clusterList.begin(), clusterList.end(), cluster); + int index = (int)std::distance(clusterList.begin(), clusterIter); + cluster.clusterLabel = index; + unordered_map removeGrids; + for (auto &gridOfCluster : cluster.grids) { + DensityGrid grid = gridOfCluster.first; + if (gridList.find(grid) != gridList.end()) + gridList.find(grid)->second.label = index; + else + removeGrids.insert(gridOfCluster); + } + for (auto &grid : removeGrids) { + if (cluster.grids.find(grid.first) != cluster.grids.end()) + cluster.grids.erase(grid.first); + } + this->clusterList.at(index) = cluster; + } } /** @@ -923,91 +841,79 @@ void SESAME::V16::cleanClusters() b. Else i. If (S1 && S2), mark as sporadic */ -void SESAME::V16::removeSporadic() -{ - // SESAME_INFO("REMOVE SPORADIC CALLED"); - // For each grid g in grid_list - - HashMap newGridList; - std::vector removeGridList; - for (auto &gridIter : this->gridList) - { - const DensityGrid &grid = gridIter.first; - CharacteristicVector characteristicVec = gridIter.second; - // If g is sporadic - if (characteristicVec.isSporadic) - { - // If currTime - tg > gap, delete g from grid_list - if (currentTimeStamp - characteristicVec.updateTime >= gap) - { - int gridClass = characteristicVec.label; - - if (gridClass != -1) - { - for (auto gridCluster : clusterList) - { - if (gridCluster.clusterLabel == gridClass) - { - gridCluster.removeGrid(grid); - } - } - } - removeGridList.push_back(grid); +void SESAME::V16::removeSporadic() { + // SESAME_INFO("REMOVE SPORADIC CALLED"); + // For each grid g in grid_list + + HashMap newGridList; + std::vector removeGridList; + for (auto &gridIter : this->gridList) { + const DensityGrid &grid = gridIter.first; + CharacteristicVector characteristicVec = gridIter.second; + // If g is sporadic + if (characteristicVec.isSporadic) { + // If currTime - tg > gap, delete g from grid_list + if (currentTimeStamp - characteristicVec.updateTime >= gap) { + 
int gridClass = characteristicVec.label; + + if (gridClass != -1) { + for (auto gridCluster : clusterList) { + if (gridCluster.clusterLabel == gridClass) { + gridCluster.removeGrid(grid); } - // Else if (S1 && S2), mark as sporadic - Else mark as normal - else - { - characteristicVec.isSporadic = checkIfSporadic(characteristicVec); - newGridList.insert(std::make_pair(grid, characteristicVec)); - } - } - // Else if (S1 && S2), mark as sporadic - else - { - characteristicVec.isSporadic = checkIfSporadic(characteristicVec); - newGridList.insert(std::make_pair(grid, characteristicVec)); + } } - } - mergeGridList(gridList, newGridList); - - // SESAME_INFO(" - Removed "<gridList.erase(sporadicGrid); - for (auto &cluster : this->clusterList) - { - if (cluster.grids.find(sporadicGrid) != cluster.grids.end()) - { - cluster.grids.erase(sporadicGrid); - } - } - for (auto &cluster : this->newClusterList) - { - if (cluster.grids.find(sporadicGrid) != cluster.grids.end()) - { - cluster.grids.erase(sporadicGrid); - break; - } - } - } + removeGridList.push_back(grid); + } + // Else if (S1 && S2), mark as sporadic - Else mark as normal + else { + characteristicVec.isSporadic = checkIfSporadic(characteristicVec); + newGridList.insert(std::make_pair(grid, characteristicVec)); + } + } + // Else if (S1 && S2), mark as sporadic + else { + characteristicVec.isSporadic = checkIfSporadic(characteristicVec); + newGridList.insert(std::make_pair(grid, characteristicVec)); + } + } + mergeGridList(gridList, newGridList); + + // SESAME_INFO(" - Removed "<gridList.erase(sporadicGrid); + for (auto &cluster : this->clusterList) { + if (cluster.grids.find(sporadicGrid) != cluster.grids.end()) { + cluster.grids.erase(sporadicGrid); + } + } + for (auto &cluster : this->newClusterList) { + if (cluster.grids.find(sporadicGrid) != cluster.grids.end()) { + cluster.grids.erase(sporadicGrid); + break; + } + } + } } /** - * Determines whether a sparse density grid is sporadic using rules S1 and S2 of Chen 
and Tu 2007 + * Determines whether a sparse density grid is sporadic using rules S1 and S2 of + * Chen and Tu 2007 * - * @param characteristicVec - the CharacteristicVector of the density grid being assessed for - * sporadicity + * @param characteristicVec - the CharacteristicVector of the density grid being + * assessed for sporadicity */ -bool SESAME::V16::checkIfSporadic(CharacteristicVector characteristicVec) -{ - // Check S1 - if (characteristicVec.getCurrGridDensity(currentTimeStamp, param.lambda) < param.outlier_cap) - { - // Check S2 TODO CHANGE REMOVE TIME FROM 0 TO -1 - if (characteristicVec.removeTime == 0 || - (currentTimeStamp - ((1 + param.beta) * characteristicVec.removeTime)) >= 0) - return true; - } - - return false; +bool SESAME::V16::checkIfSporadic(CharacteristicVector characteristicVec) { + // Check S1 + if (characteristicVec.getCurrGridDensity(currentTimeStamp, param.lambda) < + param.outlier_cap) { + // Check S2 TODO CHANGE REMOVE TIME FROM 0 TO -1 + if (characteristicVec.removeTime == 0 || + (currentTimeStamp - + ((1 + param.beta) * characteristicVec.removeTime)) >= 0) + return true; + } + + return false; } \ No newline at end of file diff --git a/src/Algorithm/DesignAspect/V9.cpp b/src/Algorithm/DesignAspect/V9.cpp index 9740941e..0968608a 100644 --- a/src/Algorithm/DesignAspect/V9.cpp +++ b/src/Algorithm/DesignAspect/V9.cpp @@ -12,200 +12,180 @@ using namespace std; -SESAME::V9::V9(param_t &cmd_params) -{ - this->param = cmd_params; - param.lambda = 1; - sum_timer.Tick(); - gap = (int)(param.cm - param.cl); - dm = param.cm; - dl = param.cl; - minVals = std::vector(param.dim, DBL_MAX); - maxVals = std::vector(param.dim, DBL_MIN); - Coord = std::vector(param.dim, 0); +SESAME::V9::V9(param_t &cmd_params) { + this->param = cmd_params; + param.lambda = 1; + sum_timer.Tick(); + gap = (int)(param.cm - param.cl); + dm = param.cm; + dl = param.cl; + minVals = std::vector(param.dim, DBL_MAX); + maxVals = std::vector(param.dim, DBL_MIN); + Coord = 
std::vector(param.dim, 0); } SESAME::V9::~V9() = default; void SESAME::V9::Init() {} -void SESAME::V9::calculateGridCoord(PointPtr point) -{ - for (int i = 0; i < param.dim; i++) - { - auto feature = point->getFeatureItem(i); - if (feature > maxVals[i]) - { - maxVals[i] = feature; - } - else if (feature < minVals[i]) - { - minVals[i] = feature; - } - Coord[i] = point->getFeatureItem(i) / param.grid_width; +void SESAME::V9::calculateGridCoord(PointPtr point) { + for (int i = 0; i < param.dim; i++) { + auto feature = point->getFeatureItem(i); + if (feature > maxVals[i]) { + maxVals[i] = feature; + } else if (feature < minVals[i]) { + minVals[i] = feature; } + Coord[i] = point->getFeatureItem(i) / param.grid_width; + } } -void SESAME::V9::RunOnline(PointPtr input) -{ - win_timer.Tick(); - this->currentTimeStamp = input->index; - if (input->getIndex() != 0 and input->getIndex() % param.landmark == 0) - { - lastLandmark = input->getIndex(); - for (auto iter = 0; iter != this->clusterList.size(); iter++) - { - PointPtr point = DataStructureFactory::createPoint(iter, 0, param.dim, 0); - auto count = 0; - for (auto &iterGrid : this->clusterList.at(iter).grids) - { - for (int iterDim = 0; iterDim < param.dim; iterDim++) - { - if (count == 0) point->setFeatureItem(0, iterDim); - point->setFeatureItem( - point->getFeatureItem(iterDim) + iterGrid.first.coordinates[iterDim], - iterDim); - if (count == this->clusterList.at(iter).grids.size() - 1) - { - point->setFeatureItem(point->getFeatureItem(iterDim) / param.dim, iterDim); - } - } - double weight = gridList.find(iterGrid.first)->second.gridDensity; - point->setWeight(point->getWeight() + weight); - count++; - } - onlineCenters.push_back(point); +void SESAME::V9::RunOnline(PointPtr input) { + win_timer.Tick(); + this->currentTimeStamp = input->index; + if (input->getIndex() != 0 and input->getIndex() % param.landmark == 0) { + lastLandmark = input->getIndex(); + for (auto iter = 0; iter != this->clusterList.size(); iter++) { 
+ PointPtr point = GenericFactory::New(param.dim, iter); + auto count = 0; + for (auto &iterGrid : this->clusterList.at(iter).grids) { + for (int iterDim = 0; iterDim < param.dim; iterDim++) { + if (count == 0) + point->setFeatureItem(0, iterDim); + point->setFeatureItem(point->getFeatureItem(iterDim) + + iterGrid.first.coordinates[iterDim], + iterDim); + if (count == this->clusterList.at(iter).grids.size() - 1) { + point->setFeatureItem(point->getFeatureItem(iterDim) / param.dim, + iterDim); + } } - // clean the old clustering information and reinitial every middle variable - minVals = std::vector(param.dim, DBL_MAX); - maxVals = std::vector(param.dim, DBL_MIN); - clusterList = std::vector(); - newClusterList = std::vector(); - Coord = std::vector(param.dim, 0); - gap = max(1.0, param.cm - param.cl); - gridList = HashMap(); - init = false; - } - win_timer.Tock(); - ds_timer.Tick(); - calculateGridCoord(input); - GridListUpdate(Coord); // tempCoord - if (!init && (currentTimeStamp - lastLandmark) == gap) - { - initialClustering(); - init = true; + double weight = gridList.find(iterGrid.first)->second.gridDensity; + point->setWeight(point->getWeight() + weight); + count++; + } + onlineCenters.push_back(point); } - if (currentTimeStamp != lastLandmark and (currentTimeStamp - lastLandmark) % gap == 0) - { - removeSporadic(); - adjustClustering(); - } - ds_timer.Tock(); - lat_timer.Add(input->toa); + // clean the old clustering information and reinitial every middle variable + minVals = std::vector(param.dim, DBL_MAX); + maxVals = std::vector(param.dim, DBL_MIN); + clusterList = std::vector(); + newClusterList = std::vector(); + Coord = std::vector(param.dim, 0); + gap = max(1.0, param.cm - param.cl); + gridList = HashMap(); + init = false; + } + win_timer.Tock(); + ds_timer.Tick(); + calculateGridCoord(input); + GridListUpdate(Coord); // tempCoord + if (!init && (currentTimeStamp - lastLandmark) == gap) { + initialClustering(); + init = true; + } + if 
(currentTimeStamp != lastLandmark and + (currentTimeStamp - lastLandmark) % gap == 0) { + removeSporadic(); + adjustClustering(); + } + ds_timer.Tock(); + lat_timer.Add(input->toa); } -void SESAME::V9::RunOffline(DataSinkPtr sinkPtr) -{ - on_timer.Add(sum_timer.start); - ref_timer.Tick(); - int cluID = 0; - for (const auto &point : onlineCenters) - { - point->setClusteringCenter(cluID++); - sinkPtr->put(point); - } - for (auto iter = 0; iter != this->clusterList.size(); iter++) - { - PointPtr point = DataStructureFactory::createPoint(iter, 0, param.dim, 0); - auto count = 0; - for (auto &iterGrid : this->clusterList.at(iter).grids) - { - for (int iterDim = 0; iterDim < param.dim; iterDim++) - { - if (count == 0) point->setFeatureItem(0, iterDim); - point->setFeatureItem( - point->getFeatureItem(iterDim) + iterGrid.first.coordinates[iterDim], iterDim); - if (count == this->clusterList.at(iter).grids.size() - 1) - { - point->setFeatureItem(point->getFeatureItem(iterDim) / param.dim, iterDim); - } - } - double weight = gridList.find(iterGrid.first)->second.gridDensity; - point->setWeight(point->getWeight() + weight); - count++; +void SESAME::V9::RunOffline(DataSinkPtr sinkPtr) { + on_timer.Add(sum_timer.start); + ref_timer.Tick(); + int cluID = 0; + for (const auto &point : onlineCenters) { + point->setClusteringCenter(cluID++); + sinkPtr->put(point); + } + for (auto iter = 0; iter != this->clusterList.size(); iter++) { + PointPtr point = GenericFactory::New(param.dim, iter); + auto count = 0; + for (auto &iterGrid : this->clusterList.at(iter).grids) { + for (int iterDim = 0; iterDim < param.dim; iterDim++) { + if (count == 0) + point->setFeatureItem(0, iterDim); + point->setFeatureItem(point->getFeatureItem(iterDim) + + iterGrid.first.coordinates[iterDim], + iterDim); + if (count == this->clusterList.at(iter).grids.size() - 1) { + point->setFeatureItem(point->getFeatureItem(iterDim) / param.dim, + iterDim); } - point->setClusteringCenter(cluID++); - 
sinkPtr->put(point); - } - ref_timer.Tock(); - sum_timer.Tock(); + } + double weight = gridList.find(iterGrid.first)->second.gridDensity; + point->setWeight(point->getWeight() + weight); + count++; + } + point->setClusteringCenter(cluID++); + sinkPtr->put(point); + } + ref_timer.Tock(); + sum_timer.Tock(); } /* Update the grid list of V9 when data inserting into the grid * */ -void SESAME::V9::GridListUpdate(const std::vector &coordinate) -{ - CharacteristicVector characteristicVec; - DensityGrid grid(coordinate); - // 3. If (g not in grid_list) insert dg to grid_list - auto it = this->gridList.find(grid); - q = this->gridList.size(); - if (it == gridList.end()) - { - characteristicVec = CharacteristicVector(currentTimeStamp, 0, 1.0, -1, false, dl, dm); - this->gridList.insert(std::make_pair(grid, characteristicVec)); - } - // 4. Update the characteristic vector of dg - else - { - it->second.densityWithNew(currentTimeStamp, param.lambda); - it->second.updateTime = currentTimeStamp; - } +void SESAME::V9::GridListUpdate(const std::vector &coordinate) { + CharacteristicVector characteristicVec; + DensityGrid grid(coordinate); + // 3. If (g not in grid_list) insert dg to grid_list + auto it = this->gridList.find(grid); + q = this->gridList.size(); + if (it == gridList.end()) { + characteristicVec = + CharacteristicVector(currentTimeStamp, 0, 1.0, -1, false, dl, dm); + this->gridList.insert(std::make_pair(grid, characteristicVec)); + } + // 4. Update the characteristic vector of dg + else { + it->second.densityWithNew(currentTimeStamp, param.lambda); + it->second.updateTime = currentTimeStamp; + } } /** * Implements the procedure given in Figure 3 of Chen and Tu 2007 */ -void SESAME::V9::initialClustering() -{ - // 1. Update the density of all grids in grid_list - // Timer: online grid - updateGridListDensity(); - // 2. Assign each dense grid to a distinct cluster - // and - // 3. 
Label all other grids as NO_CLASS - auto gridIter = this->gridList.begin(); - HashMap newGridList; - while (gridIter != gridList.end()) - { - DensityGrid grid = gridIter->first; - CharacteristicVector characteristicVecOfG = gridIter->second; - if (characteristicVecOfG.attribute == DENSE) - { - int gridClass = this->clusterList.size(); - characteristicVecOfG.label = gridClass; - GridCluster gridCluster = GridCluster(gridClass); - gridCluster.addGrid(grid); - this->clusterList.push_back(gridCluster); - // // SESAME_INFO(" was dense (class "<clusterList.size()); - } - else - characteristicVecOfG.label = NO_CLASS; - newGridList.insert(std::make_pair(grid, characteristicVecOfG)); - ++gridIter; - } - this->gridList = newGridList; - // 4. Make changes to grid labels by doing: - // a. For each cluster c - // b. For each outside grid g of c - // c. For each neighbouring grid h of g - // d. If h belongs to c', label c and c' with - // the label of the largest cluster - // e. Else if h is transitional, assign it to c - // f. While changes can be made - while (adjustLabels()) - ; // while changes are being made +void SESAME::V9::initialClustering() { + // 1. Update the density of all grids in grid_list + // Timer: online grid + updateGridListDensity(); + // 2. Assign each dense grid to a distinct cluster + // and + // 3. 
Label all other grids as NO_CLASS + auto gridIter = this->gridList.begin(); + HashMap newGridList; + while (gridIter != gridList.end()) { + DensityGrid grid = gridIter->first; + CharacteristicVector characteristicVecOfG = gridIter->second; + if (characteristicVecOfG.attribute == DENSE) { + int gridClass = this->clusterList.size(); + characteristicVecOfG.label = gridClass; + GridCluster gridCluster = GridCluster(gridClass); + gridCluster.addGrid(grid); + this->clusterList.push_back(gridCluster); + // // SESAME_INFO(" was dense (class "<clusterList.size()); + } else + characteristicVecOfG.label = NO_CLASS; + newGridList.insert(std::make_pair(grid, characteristicVecOfG)); + ++gridIter; + } + this->gridList = newGridList; + // 4. Make changes to grid labels by doing: + // a. For each cluster c + // b. For each outside grid g of c + // c. For each neighbouring grid h of g + // d. If h belongs to c', label c and c' with + // the label of the largest cluster + // e. Else if h is transitional, assign it to c + // f. While changes can be made + while (adjustLabels()) + ; // while changes are being made } /** * Makes first change available to it by following the steps: @@ -216,87 +196,77 @@ void SESAME::V9::initialClustering() * Else if h is transitional, assign it to c * @return TRUE if a change was made to any cluster's labels, FALSE otherwise */ -bool SESAME::V9::adjustLabels() -{ - // bool adjust=false; - // a. For each cluster c - for (GridCluster &gridCluster : this->clusterList) - { - // // SESAME_INFO("Adjusting from cluster "<first; - bool inside = gridIter->second; - // // SESAME_INFO(" Inspecting density grid, grid, standby..."); - - // b. for each OUTSIDE grid of cluster - if (!inside) - { - // // SESAME_INFO(" Density grid dg is outside!"); - // c. 
for each neighbouring grid, of current iter grid - for (const DensityGrid &gridNeighbourhood : grid.getNeighbours()) - { - auto it2 = this->gridList.find(gridNeighbourhood); - if (it2 != gridList.end()) - { - auto it1 = this->gridList.find(grid); - CharacteristicVector characteristicVec1 = it1->second; - CharacteristicVector characteristicVec2 = it2->second; - int class1 = characteristicVec1.label; - int class2 = characteristicVec2.label; - // ...and if neighbouring grid isn't already in the same cluster as - // grid... - if (class1 != class2) - { - // If neighbouring grid is in cluster c', merge c and c' into the - // larger of the two - if (class2 != NO_CLASS) - { - if (this->clusterList.at(class1).grids.size() < - this->clusterList.at(class2).grids.size()) - mergeClusters(class1, class2); - else - mergeClusters(class2, class1); - return true; - } - // If gridNeighbourhood is transitional and 'outside' of the - // cluster, assign it to cluster - else if (characteristicVec2.isTransitional(dm, dl)) - { - characteristicVec2.label = class1; - gridCluster.addGrid(gridNeighbourhood); - this->clusterList.at(class1) = gridCluster; - if (it1 != gridList.end()) - it1->second = characteristicVec2; - else - this->gridList.insert( - std::make_pair(grid, characteristicVec2)); - return true; - } - } - } - } +bool SESAME::V9::adjustLabels() { + // bool adjust=false; + // a. For each cluster c + for (GridCluster &gridCluster : this->clusterList) { + // // SESAME_INFO("Adjusting from cluster "<first; + bool inside = gridIter->second; + // // SESAME_INFO(" Inspecting density grid, grid, standby..."); + + // b. for each OUTSIDE grid of cluster + if (!inside) { + // // SESAME_INFO(" Density grid dg is outside!"); + // c. 
for each neighbouring grid, of current iter grid + for (const DensityGrid &gridNeighbourhood : grid.getNeighbours()) { + auto it2 = this->gridList.find(gridNeighbourhood); + if (it2 != gridList.end()) { + auto it1 = this->gridList.find(grid); + CharacteristicVector characteristicVec1 = it1->second; + CharacteristicVector characteristicVec2 = it2->second; + int class1 = characteristicVec1.label; + int class2 = characteristicVec2.label; + // ...and if neighbouring grid isn't already in the same cluster + // as grid... + if (class1 != class2) { + // If neighbouring grid is in cluster c', merge c and c' into + // the larger of the two + if (class2 != NO_CLASS) { + if (this->clusterList.at(class1).grids.size() < + this->clusterList.at(class2).grids.size()) + mergeClusters(class1, class2); + else + mergeClusters(class2, class1); + return true; + } + // If gridNeighbourhood is transitional and 'outside' of the + // cluster, assign it to cluster + else if (characteristicVec2.isTransitional(dm, dl)) { + characteristicVec2.label = class1; + gridCluster.addGrid(gridNeighbourhood); + this->clusterList.at(class1) = gridCluster; + if (it1 != gridList.end()) + it1->second = characteristicVec2; + else + this->gridList.insert( + std::make_pair(grid, characteristicVec2)); + return true; } + } } + } } + } } - return false; + } + return false; } /** - * Iterates through grid_list and updates the density for each density grid therein. - * Also marks each density grid as unvisited for this call to adjustClustering. + * Iterates through grid_list and updates the density for each density grid + * therein. Also marks each density grid as unvisited for this call to + * adjustClustering. 
*/ -void SESAME::V9::updateGridListDensity() -{ - for (auto &iter : this->gridList) - { - iter.second.isVisited = false; - iter.second.UpdateAllDensity(currentTimeStamp, param.lambda, dl, dm); - } +void SESAME::V9::updateGridListDensity() { + for (auto &iter : this->gridList) { + iter.second.isVisited = false; + iter.second.UpdateAllDensity(currentTimeStamp, param.lambda, dl, dm); + } } /** @@ -305,66 +275,65 @@ void SESAME::V9::updateGridListDensity() * * @see moa.clusterers.V9.V9#gap */ -void SESAME::V9::adjustClustering() -{ - // SESAME_INFO("ADJUST CLUSTERING CALLED "); - // 1. Update the density of all grids in grid_list - updateGridListDensity(); - // 2. For each grid dg whose attribute is changed since last call - // a. If dg is sparse - // b. If dg is dense - // c. If dg is transitional - while (inspectChangedGrids()) - ; +void SESAME::V9::adjustClustering() { + // SESAME_INFO("ADJUST CLUSTERING CALLED "); + // 1. Update the density of all grids in grid_list + updateGridListDensity(); + // 2. For each grid dg whose attribute is changed since last call + // a. If dg is sparse + // b. If dg is dense + // c. If dg is transitional + while (inspectChangedGrids()) + ; } /** - * Inspects each density grid in grid_list whose attribute has changed since the last - * call to adjustClustering. Implements lines 3/4/7/19 of the procedure given in Figure - * 4 of Chen and Tu 2007. + * Inspects each density grid in grid_list whose attribute has changed since the + * last call to adjustClustering. Implements lines 3/4/7/19 of the procedure + * given in Figure 4 of Chen and Tu 2007. * * @return TRUE if any grids are updated; FALSE otherwise. 
*/ -bool SESAME::V9::inspectChangedGrids() -{ - HashMap newGridList; - auto gridIter = this->gridList.begin(); - int a = 0; - while (gridIter != gridList.end()) //&& newGridList.empty() - { - const DensityGrid &grid = gridIter->first; - const CharacteristicVector &characteristicVec = gridIter->second; - int gridClass = characteristicVec.label; - if (characteristicVec.attChange && !characteristicVec.isVisited) // grid.isVisited - { // grid.isVisited=true; - gridIter->second.isVisited = true; - newGridList.insert(std::make_pair(grid, characteristicVec)); - if (characteristicVec.attribute == SPARSE) - mergeGridList(newGridList, adjustForSparseGrid(grid, characteristicVec, gridClass)); - else if (characteristicVec.attribute == DENSE) - mergeGridList(newGridList, adjustForDenseGrid(grid, characteristicVec, gridClass)); - else // TRANSITIONAL - mergeGridList(newGridList, - adjustForTransitionalGrid(grid, characteristicVec, gridClass)); - } - ++gridIter; - a++; - } - // If there are grids in new grid list, update the corresponding grids in grid_list and clean up - // the cluster list - if (!newGridList.empty()) - { - mergeGridList(this->gridList, newGridList); - cleanClusters(); - return true; - } - else - return false; +bool SESAME::V9::inspectChangedGrids() { + HashMap newGridList; + auto gridIter = this->gridList.begin(); + int a = 0; + while (gridIter != gridList.end()) //&& newGridList.empty() + { + const DensityGrid &grid = gridIter->first; + const CharacteristicVector &characteristicVec = gridIter->second; + int gridClass = characteristicVec.label; + if (characteristicVec.attChange && + !characteristicVec.isVisited) // grid.isVisited + { // grid.isVisited=true; + gridIter->second.isVisited = true; + newGridList.insert(std::make_pair(grid, characteristicVec)); + if (characteristicVec.attribute == SPARSE) + mergeGridList(newGridList, + adjustForSparseGrid(grid, characteristicVec, gridClass)); + else if (characteristicVec.attribute == DENSE) + 
mergeGridList(newGridList, + adjustForDenseGrid(grid, characteristicVec, gridClass)); + else // TRANSITIONAL + mergeGridList(newGridList, adjustForTransitionalGrid( + grid, characteristicVec, gridClass)); + } + ++gridIter; + a++; + } + // If there are grids in new grid list, update the corresponding grids in + // grid_list and clean up the cluster list + if (!newGridList.empty()) { + mergeGridList(this->gridList, newGridList); + cleanClusters(); + return true; + } else + return false; } /** - * Adjusts the clustering of a sparse density grid. Implements lines 5 and 6 from Figure 4 of Chen - * and Tu 2007. + * Adjusts the clustering of a sparse density grid. Implements lines 5 and 6 + * from Figure 4 of Chen and Tu 2007. * * @param dg the sparse density grid being adjusted * @param cv the characteristic vector of dg @@ -372,153 +341,139 @@ bool SESAME::V9::inspectChangedGrids() * * @return a HashMap containing density grids for update after this iteration */ -SESAME::HashMap SESAME::V9::adjustForSparseGrid(const DensityGrid &grid, - CharacteristicVector characteristicVec, - int gridClass) -{ - HashMap newGridList; - if (gridClass != NO_CLASS) - { - //// SESAME_INFO("It is removed from cluster "< containing density grids for update after - * this iteration + * @return a HashMap containing density grids + * for update after this iteration */ -SESAME::HashMap SESAME::V9::reCluster(GridCluster &gridCluster) -{ - // SESAME_INFO("Now re-cluster!"); - HashMap newGridList; - auto gcIter = gridCluster.grids.begin(); - // // SESAME_INFO("ReCluster called for cluster "<first; - CharacteristicVector characteristicVecOfGrid = this->gridList.find(grid)->second; - if (characteristicVecOfGrid.attribute == DENSE) - { - int gridClass = (int)newClusterList.size(); - characteristicVecOfGrid.label = gridClass; - GridCluster newCluster(gridClass); - newCluster.addGrid(grid); - newClusterList.push_back(newCluster); - } - else - characteristicVecOfGrid.label = NO_CLASS; - 
newGridList.insert(std::make_pair(grid, characteristicVecOfGrid)); - gcIter++; - } - - bool changesMade; - // While changes can be made... - do - { - changesMade = false; - HashMap gridListAdjusted = adjustNewLabels(newGridList); - if (!gridListAdjusted.empty()) - { - // SESAME_INFO("grid list is adjusted for sparse!"); - mergeGridList(newGridList, gridListAdjusted); - changesMade = true; - } - } while (changesMade); - - // Update the cluster list with the newly formed clusters - gridCluster.grids.clear(); - this->clusterList.at(gridCluster.clusterLabel) = gridCluster; - for (GridCluster &cluster : newClusterList) this->clusterList.push_back(cluster); - return newGridList; +SESAME::HashMap SESAME::V9::reCluster(GridCluster &gridCluster) { + // SESAME_INFO("Now re-cluster!"); + HashMap newGridList; + auto gcIter = gridCluster.grids.begin(); + // // SESAME_INFO("ReCluster called for cluster "<first; + CharacteristicVector characteristicVecOfGrid = + this->gridList.find(grid)->second; + if (characteristicVecOfGrid.attribute == DENSE) { + int gridClass = (int)newClusterList.size(); + characteristicVecOfGrid.label = gridClass; + GridCluster newCluster(gridClass); + newCluster.addGrid(grid); + newClusterList.push_back(newCluster); + } else + characteristicVecOfGrid.label = NO_CLASS; + newGridList.insert(std::make_pair(grid, characteristicVecOfGrid)); + gcIter++; + } + + bool changesMade; + // While changes can be made... 
+ do { + changesMade = false; + HashMap gridListAdjusted = adjustNewLabels(newGridList); + if (!gridListAdjusted.empty()) { + // SESAME_INFO("grid list is adjusted for sparse!"); + mergeGridList(newGridList, gridListAdjusted); + changesMade = true; + } + } while (changesMade); + + // Update the cluster list with the newly formed clusters + gridCluster.grids.clear(); + this->clusterList.at(gridCluster.clusterLabel) = gridCluster; + for (GridCluster &cluster : newClusterList) + this->clusterList.push_back(cluster); + return newGridList; } -SESAME::HashMap SESAME::V9::adjustNewLabels(const SESAME::HashMap &newGridList) -{ - HashMap gridListAdjusted; - // a. For each cluster c - for (GridCluster &gridCluster : newClusterList) - { - for (auto &gridIter : gridCluster.grids) - { - DensityGrid grid = gridIter.first; - bool inside = gridIter.second; - - // b. for each OUTSIDE grid, dg, of c - if (!inside) - { - // c. for each neighbouring grid, neighbourGrid, of dg - - for (DensityGrid &neighbourGrid : grid.getNeighbours()) - { - if (newGridList.find(neighbourGrid) != newGridList.end()) - { - CharacteristicVector characteristicVec1 = - newGridList.find(neighbourGrid)->second; - CharacteristicVector characteristicVec2 = - newGridList.find(neighbourGrid)->second; - int class1 = characteristicVec1.label; - int class2 = characteristicVec2.label; - - // ...and if neighbourGrid isn't already in the same cluster as dg... 
- if (class1 != class2) - { - GridCluster cluster1 = newClusterList.at(class1); - // If dgprime is in cluster c', merge c and c' into the larger of the - // two - if (class2 != NO_CLASS) - { - GridCluster cluster2 = newClusterList.at(class2); - // System.out.println("C is "+class1+" and C' is "+class2+"."); - if (cluster1.grids.size() < cluster2.grids.size()) - mergeGridList(gridListAdjusted, - mergeNewClusters(newGridList, class1, class2)); - else - mergeGridList(gridListAdjusted, - mergeNewClusters(newGridList, class2, class1)); - - return gridListAdjusted; - } - // If neighbourGrid is transitional and outside of cluster, assign it to - // cluster - else if (characteristicVec2.isTransitional(dm, dl)) - { - characteristicVec2.label = class1; - cluster1.addGrid(neighbourGrid); - this->newClusterList.at(class1) = cluster1; - gridListAdjusted.insert( - std::make_pair(neighbourGrid, characteristicVec2)); - return gridListAdjusted; - } - } - } - } +SESAME::HashMap +SESAME::V9::adjustNewLabels(const SESAME::HashMap &newGridList) { + HashMap gridListAdjusted; + // a. For each cluster c + for (GridCluster &gridCluster : newClusterList) { + for (auto &gridIter : gridCluster.grids) { + DensityGrid grid = gridIter.first; + bool inside = gridIter.second; + + // b. for each OUTSIDE grid, dg, of c + if (!inside) { + // c. for each neighbouring grid, neighbourGrid, of dg + + for (DensityGrid &neighbourGrid : grid.getNeighbours()) { + if (newGridList.find(neighbourGrid) != newGridList.end()) { + CharacteristicVector characteristicVec1 = + newGridList.find(neighbourGrid)->second; + CharacteristicVector characteristicVec2 = + newGridList.find(neighbourGrid)->second; + int class1 = characteristicVec1.label; + int class2 = characteristicVec2.label; + + // ...and if neighbourGrid isn't already in the same cluster as + // dg... 
+ if (class1 != class2) { + GridCluster cluster1 = newClusterList.at(class1); + // If dgprime is in cluster c', merge c and c' into the larger of + // the two + if (class2 != NO_CLASS) { + GridCluster cluster2 = newClusterList.at(class2); + // System.out.println("C is "+class1+" and C' is "+class2+"."); + if (cluster1.grids.size() < cluster2.grids.size()) + mergeGridList(gridListAdjusted, + mergeNewClusters(newGridList, class1, class2)); + else + mergeGridList(gridListAdjusted, + mergeNewClusters(newGridList, class2, class1)); + + return gridListAdjusted; + } + // If neighbourGrid is transitional and outside of cluster, assign + // it to cluster + else if (characteristicVec2.isTransitional(dm, dl)) { + characteristicVec2.label = class1; + cluster1.addGrid(neighbourGrid); + this->newClusterList.at(class1) = cluster1; + gridListAdjusted.insert( + std::make_pair(neighbourGrid, characteristicVec2)); + return gridListAdjusted; + } } + } } + } } - return gridListAdjusted; + } + return gridListAdjusted; } /** - * Adjusts the clustering of a dense density grid. Implements lines 8 through 18 from Figure 4 of - * Chen and Tu 2007. + * Adjusts the clustering of a dense density grid. Implements lines 8 through 18 + * from Figure 4 of Chen and Tu 2007. 
* * @param grid the dense density grid being adjusted * @param characteristicVec the characteristic vector of dg @@ -526,241 +481,222 @@ SESAME::HashMap SESAME::V9::adjustNewLabels(const SESAME::HashMap &newGridList) * * @return a HashMapcontaining density grids for update after this iteration */ -SESAME::HashMap SESAME::V9::adjustForDenseGrid(const DensityGrid &grid, - CharacteristicVector characteristicVec, - int gridClass) -{ - // Among all neighbours of dg, find the grid h whose cluster ch has the largest size - GridCluster gridCluster; // The cluster, ch, of h - DensityGrid gridChosen(grid); // The chosen grid h, whose cluster ch has the largest size - double ChosenGridSize = -1.0; // The size of gridCluster, the largest cluster - int hClass = NO_CLASS; // The class label of h - int hChosenClass = NO_CLASS; // The class label of ch - - HashMap newGridList; - //// SESAME_INFO("adjust For Dense Grid "<gridList.find(neighbourGrid) != gridList.end()) - { - hClass = this->gridList.find(neighbourGrid)->second.label; - if (hClass != NO_CLASS) - { - for (auto gridCluster : clusterList) - { - if (gridCluster.clusterLabel == hClass) - { - if (gridCluster.grids.size() > ChosenGridSize) - { - ChosenGridSize = gridCluster.grids.size(); - hChosenClass = hClass; - gridChosen = DensityGrid(neighbourGrid); - } - } - } +SESAME::HashMap +SESAME::V9::adjustForDenseGrid(const DensityGrid &grid, + CharacteristicVector characteristicVec, + int gridClass) { + // Among all neighbours of dg, find the grid h whose cluster ch has the + // largest size + GridCluster gridCluster; // The cluster, ch, of h + DensityGrid gridChosen( + grid); // The chosen grid h, whose cluster ch has the largest size + double ChosenGridSize = -1.0; // The size of gridCluster, the largest cluster + int hClass = NO_CLASS; // The class label of h + int hChosenClass = NO_CLASS; // The class label of ch + + HashMap newGridList; + //// SESAME_INFO("adjust For Dense Grid "<gridList.find(neighbourGrid) != 
gridList.end()) { + hClass = this->gridList.find(neighbourGrid)->second.label; + if (hClass != NO_CLASS) { + for (auto gridCluster : clusterList) { + if (gridCluster.clusterLabel == hClass) { + if (gridCluster.grids.size() > ChosenGridSize) { + ChosenGridSize = gridCluster.grids.size(); + hChosenClass = hClass; + gridChosen = DensityGrid(neighbourGrid); } + } } + } } + } - if (hChosenClass != NO_CLASS && hChosenClass != gridClass) - { - gridCluster = this->clusterList.at(hChosenClass); - - // If h is a dense grid - if (this->gridList.find(gridChosen)->second.attribute == DENSE) - { - // // SESAME_INFO("h is dense."); - // If dg is labelled as NO_CLASS - if (gridClass == NO_CLASS) - { - // // SESAME_INFO("g was labelled NO_CLASS"); - characteristicVec.label = hChosenClass; - newGridList.insert(std::make_pair(grid, characteristicVec)); - gridCluster.addGrid(grid); - this->clusterList.at(hChosenClass) = gridCluster; - } - // Else if dg belongs to cluster c and h belongs to c' - else - { - // // SESAME_INFO("g was labelled "<clusterList.at(gridClass).grids.size(); - - if (gSize <= ChosenGridSize) - mergeClusters(gridClass, hChosenClass); - else - mergeClusters(hChosenClass, gridClass); - } - } + if (hChosenClass != NO_CLASS && hChosenClass != gridClass) { + gridCluster = this->clusterList.at(hChosenClass); - // Else if h is a transitional grid - else if (this->gridList.at(gridChosen).attribute == TRANSITIONAL) - { - // // SESAME_INFO("h is transitional."); - // If dg is labelled as no class and if h is an outside grid if dg is added to ch - if (gridClass == NO_CLASS && !gridCluster.isInside(gridChosen, grid)) - { - characteristicVec.label = hChosenClass; - newGridList.insert(std::make_pair(grid, characteristicVec)); - gridCluster.addGrid(grid); - this->clusterList.at(hChosenClass) = gridCluster; - // // SESAME_INFO(" dg is added to cluster "<= |ch| - else if (gridClass != NO_CLASS) - { - GridCluster c = this->clusterList.at(gridClass); - double gSize = c.grids.size(); 
- - if (gSize >= ChosenGridSize) - { - // Move h from cluster ch to cluster c - gridCluster.removeGrid(gridChosen); - c.addGrid(gridChosen); - CharacteristicVector cvhChosen = this->gridList.find(gridChosen)->second; - cvhChosen.label = gridClass; - newGridList.insert(std::make_pair(gridChosen, cvhChosen)); - // // SESAME_INFO("dgClass is "<clusterList.at(hChosenClass) = gridCluster; - this->clusterList.at(gridClass) = c; - } - } - } - } - // If dgClass is dense and not in a cluster, and none if its neighbours are in a cluster, - // put it in its own new cluster and search the neighbourhood for transitional or dense - // grids to add - else if (gridClass == NO_CLASS) - { - int newClass = (int)this->clusterList.size(); - GridCluster c = GridCluster(newClass); - c.addGrid(grid); - // System.out.println("Added "+dg.toString()+" to cluster "+newClass+"."); - this->clusterList.push_back(c); - characteristicVec.label = newClass; - if (newGridList.find(grid) != newGridList.end()) - newGridList.find(grid)->second = characteristicVec; + // If h is a dense grid + if (this->gridList.find(gridChosen)->second.attribute == DENSE) { + // // SESAME_INFO("h is dense."); + // If dg is labelled as NO_CLASS + if (gridClass == NO_CLASS) { + // // SESAME_INFO("g was labelled NO_CLASS"); + characteristicVec.label = hChosenClass; + newGridList.insert(std::make_pair(grid, characteristicVec)); + gridCluster.addGrid(grid); + this->clusterList.at(hChosenClass) = gridCluster; + } + // Else if dg belongs to cluster c and h belongs to c' + else { + // // SESAME_INFO("g was labelled "<clusterList.at(gridClass).grids.size(); + + if (gSize <= ChosenGridSize) + mergeClusters(gridClass, hChosenClass); else - newGridList.insert(std::make_pair(grid, characteristicVec)); - // Iterate through the neighbourhood until no more transitional neighbours can be added - // (dense neighbours will add themselves as part of their adjust process) - for (DensityGrid &dghprime : grid.getNeighbours()) - { - if 
(this->gridList.find(dghprime) != this->gridList.end() && - c.grids.find(dghprime) != c.grids.end()) - { - CharacteristicVector cvhprime = this->gridList.find(dghprime)->second; - if (cvhprime.attribute == TRANSITIONAL) - { - c.addGrid(dghprime); - cvhprime.label = newClass; - newGridList.insert(std::make_pair(dghprime, cvhprime)); - } - } + mergeClusters(hChosenClass, gridClass); + } + } + + // Else if h is a transitional grid + else if (this->gridList.at(gridChosen).attribute == TRANSITIONAL) { + // // SESAME_INFO("h is transitional."); + // If dg is labelled as no class and if h is an outside grid if dg is + // added to ch + if (gridClass == NO_CLASS && !gridCluster.isInside(gridChosen, grid)) { + characteristicVec.label = hChosenClass; + newGridList.insert(std::make_pair(grid, characteristicVec)); + gridCluster.addGrid(grid); + this->clusterList.at(hChosenClass) = gridCluster; + // // SESAME_INFO(" dg is added to cluster "<= |ch| + else if (gridClass != NO_CLASS) { + GridCluster c = this->clusterList.at(gridClass); + double gSize = c.grids.size(); + + if (gSize >= ChosenGridSize) { + // Move h from cluster ch to cluster c + gridCluster.removeGrid(gridChosen); + c.addGrid(gridChosen); + CharacteristicVector cvhChosen = + this->gridList.find(gridChosen)->second; + cvhChosen.label = gridClass; + newGridList.insert(std::make_pair(gridChosen, cvhChosen)); + // // SESAME_INFO("dgClass is "<clusterList.at(hChosenClass) = gridCluster; + this->clusterList.at(gridClass) = c; + } + } + } + } + // If dgClass is dense and not in a cluster, and none if its neighbours are in + // a cluster, put it in its own new cluster and search the neighbourhood for + // transitional or dense grids to add + else if (gridClass == NO_CLASS) { + int newClass = (int)this->clusterList.size(); + GridCluster c = GridCluster(newClass); + c.addGrid(grid); + // System.out.println("Added "+dg.toString()+" to cluster "+newClass+"."); + this->clusterList.push_back(c); + characteristicVec.label = 
newClass; + if (newGridList.find(grid) != newGridList.end()) + newGridList.find(grid)->second = characteristicVec; + else + newGridList.insert(std::make_pair(grid, characteristicVec)); + // Iterate through the neighbourhood until no more transitional neighbours + // can be added (dense neighbours will add themselves as part of their + // adjust process) + for (DensityGrid &dghprime : grid.getNeighbours()) { + if (this->gridList.find(dghprime) != this->gridList.end() && + c.grids.find(dghprime) != c.grids.end()) { + CharacteristicVector cvhprime = this->gridList.find(dghprime)->second; + if (cvhprime.attribute == TRANSITIONAL) { + c.addGrid(dghprime); + cvhprime.label = newClass; + newGridList.insert(std::make_pair(dghprime, cvhprime)); } - this->clusterList.at(newClass) = c; + } } + this->clusterList.at(newClass) = c; + } - return newGridList; + return newGridList; } /** - * Adjusts the clustering of a transitional density grid. Implements lines 20 and 21 from Figure 4 - * of Chen and Tu 2007. + * Adjusts the clustering of a transitional density grid. Implements lines 20 + * and 21 from Figure 4 of Chen and Tu 2007. 
* * @param dg the dense density grid being adjusted * @param cv the characteristic vector of dg * @param dgClass the cluster to which dg belonged * - * @return a HashMap containing density grids for update after - * this iteration + * @return a HashMap containing density grids + * for update after this iteration */ -SESAME::HashMap SESAME::V9::adjustForTransitionalGrid(const DensityGrid &grid, - CharacteristicVector characteristicVec, - int gridClass) -{ - // Among all neighbours of dg, find the grid h whose cluster ch has the largest size - // and satisfies that dg would be an outside grid if added to it - GridCluster gridCluster; // The cluster, ch, of h - double hChosenSize = 0.0; // The size of ch, the largest cluster - DensityGrid neighbourGrid; // The neighbour of dg being considered - int hClass = NO_CLASS; // The class label of h - int hChosenClass = NO_CLASS; // The class label of ch - HashMap newGridList; - //// SESAME_INFO("adjust For Transitional Grid "<gridList.find(neighbourGrid); - if (it != gridList.end()) - { - hClass = it->second.label; - ; - if (hClass != NO_CLASS) - { - for (auto gridCluster : clusterList) - { - if (gridCluster.clusterLabel == hClass) - { - if ((gridCluster.grids.size() > hChosenSize) && - !gridCluster.isInside(grid, grid)) - { - hChosenSize = gridCluster.grids.size(); - hChosenClass = hClass; - } - } - } +SESAME::HashMap +SESAME::V9::adjustForTransitionalGrid(const DensityGrid &grid, + CharacteristicVector characteristicVec, + int gridClass) { + // Among all neighbours of dg, find the grid h whose cluster ch has the + // largest size and satisfies that dg would be an outside grid if added to it + GridCluster gridCluster; // The cluster, ch, of h + double hChosenSize = 0.0; // The size of ch, the largest cluster + DensityGrid neighbourGrid; // The neighbour of dg being considered + int hClass = NO_CLASS; // The class label of h + int hChosenClass = NO_CLASS; // The class label of ch + HashMap newGridList; + //// 
SESAME_INFO("adjust For Transitional Grid "<gridList.find(neighbourGrid); + if (it != gridList.end()) { + hClass = it->second.label; + ; + if (hClass != NO_CLASS) { + for (auto gridCluster : clusterList) { + if (gridCluster.clusterLabel == hClass) { + if ((gridCluster.grids.size() > hChosenSize) && + !gridCluster.isInside(grid, grid)) { + hChosenSize = gridCluster.grids.size(); + hChosenClass = hClass; } + } } + } } + } - if (hChosenClass != NO_CLASS && hChosenClass != gridClass) - { - for (auto gridCluster : clusterList) - { - if (gridCluster.clusterLabel == hChosenClass) gridCluster.addGrid(grid); - if (gridCluster.clusterLabel == gridClass and gridClass != NO_CLASS) - gridCluster.removeGrid(grid); - } - gridCluster = this->clusterList.at(hChosenClass); + if (hChosenClass != NO_CLASS && hChosenClass != gridClass) { + for (auto gridCluster : clusterList) { + if (gridCluster.clusterLabel == hChosenClass) gridCluster.addGrid(grid); - this->clusterList.at(hChosenClass) = gridCluster; - - characteristicVec.label = hChosenClass; - newGridList.insert(std::make_pair(grid, characteristicVec)); + if (gridCluster.clusterLabel == gridClass and gridClass != NO_CLASS) + gridCluster.removeGrid(grid); } + gridCluster = this->clusterList.at(hChosenClass); + gridCluster.addGrid(grid); + this->clusterList.at(hChosenClass) = gridCluster; - return newGridList; -} + characteristicVec.label = hChosenClass; + newGridList.insert(std::make_pair(grid, characteristicVec)); + } -SESAME::HashMap SESAME::V9::mergeNewClusters(SESAME::HashMap newGridList, int smallCluster, - int bigCluster) -{ - // System.out.println("Merge new clusters "+smallCluster+" and "+bigCluster+"."); - // Iterate through the density grids in grid_list to find those which are in highClass - for (HashMap::iterator gridIter = newGridList.begin(); gridIter != gridList.end(); gridIter++) - { - DensityGrid grid = gridIter->first; - CharacteristicVector characteristicVec = gridIter->second; - - // Assign density grids in small 
Cluster to bigCluster - if (characteristicVec.label == smallCluster) - { - characteristicVec.label = bigCluster; - newGridList.insert(std::make_pair(grid, characteristicVec)); - } - } - // SESAME_INFO("Density grids assigned to cluster "<newClusterList.at(bigCluster); - bGC.absorbCluster(this->newClusterList.at(smallCluster)); - this->newClusterList.at(bigCluster) = bGC; - this->newClusterList.erase(this->newClusterList.begin() + smallCluster); - // System.out.println("Cluster "+smallClus+" removed from list."); - newGridList = cleanNewClusters(newGridList); + return newGridList; +} - return newGridList; +SESAME::HashMap SESAME::V9::mergeNewClusters(SESAME::HashMap newGridList, + int smallCluster, int bigCluster) { + // System.out.println("Merge new clusters "+smallCluster+" and + // "+bigCluster+"."); Iterate through the density grids in grid_list to find + // those which are in highClass + for (HashMap::iterator gridIter = newGridList.begin(); + gridIter != gridList.end(); gridIter++) { + DensityGrid grid = gridIter->first; + CharacteristicVector characteristicVec = gridIter->second; + + // Assign density grids in small Cluster to bigCluster + if (characteristicVec.label == smallCluster) { + characteristicVec.label = bigCluster; + newGridList.insert(std::make_pair(grid, characteristicVec)); + } + } + // SESAME_INFO("Density grids assigned to cluster "<newClusterList.at(bigCluster); + bGC.absorbCluster(this->newClusterList.at(smallCluster)); + this->newClusterList.at(bigCluster) = bGC; + this->newClusterList.erase(this->newClusterList.begin() + smallCluster); + // System.out.println("Cluster "+smallClus+" removed from list."); + newGridList = cleanNewClusters(newGridList); + + return newGridList; } /** @@ -770,133 +706,122 @@ SESAME::HashMap SESAME::V9::mergeNewClusters(SESAME::HashMap newGridList, int sm * @param smallCluster - the index of the smaller cluster * @param bigCluster - the index of the bigger cluster */ -void SESAME::V9::mergeClusters(int 
smallCluster, int bigCluster) -{ - // SESAME_INFO("Merge clusters "<second.label == smallCluster) - { - gridIter->second.label = bigCluster; - } - } - // SESAME_INFO("Density grids assigned to cluster "<clusterList.at(bigCluster); - bigGridCluster.absorbCluster(this->clusterList.at(smallCluster)); - this->clusterList.at(bigCluster) = bigGridCluster; - this->clusterList.erase(clusterList.begin() + smallCluster); - // SESAME_INFO("Cluster "<second.label == smallCluster) { + gridIter->second.label = bigCluster; + } + } + // SESAME_INFO("Density grids assigned to cluster "<clusterList.at(bigCluster); + bigGridCluster.absorbCluster(this->clusterList.at(smallCluster)); + this->clusterList.at(bigCluster) = bigGridCluster; + this->clusterList.erase(clusterList.begin() + smallCluster); + // SESAME_INFO("Cluster "< toRemove; - // Check to see if there are any empty clusters - for (auto &cluster : this->newClusterList) - { - if (cluster.grids.empty()) toRemove.push_back(cluster); - } - - // Remove empty clusters - if (!toRemove.empty()) - { - for (auto &RemoveCluster : toRemove) - { - auto removeCIter = - std::find(newClusterList.begin(), newClusterList.end(), RemoveCluster); - if (std::find(newClusterList.begin(), newClusterList.end(), RemoveCluster) != - newClusterList.end()) - this->newClusterList.erase(removeCIter); - } - } - for (auto &cluster : this->newClusterList) - { - auto clusterIter = std::find(newClusterList.begin(), newClusterList.end(), cluster); - int index = (int)std::distance(newClusterList.begin(), clusterIter); - cluster.clusterLabel = index; - unordered_map removeGrids; - for (auto &gridOfCluster : cluster.grids) - { - DensityGrid grid = gridOfCluster.first; - ; - if (newGridList.find(grid) != newGridList.end()) - newGridList.find(grid)->second.label = index; - else - removeGrids.insert(gridOfCluster); - } - for (auto &grid : removeGrids) - { - if (cluster.grids.find(grid.first) != cluster.grids.end()) - cluster.grids.erase(grid.first); - } - } - // 
SESAME_INFO("Clean finish!"); - return newGridList; +SESAME::HashMap SESAME::V9::cleanNewClusters(SESAME::HashMap newGridList) { + std::vector toRemove; + // Check to see if there are any empty clusters + for (auto &cluster : this->newClusterList) { + if (cluster.grids.empty()) + toRemove.push_back(cluster); + } + + // Remove empty clusters + if (!toRemove.empty()) { + for (auto &RemoveCluster : toRemove) { + auto removeCIter = std::find(newClusterList.begin(), newClusterList.end(), + RemoveCluster); + if (std::find(newClusterList.begin(), newClusterList.end(), + RemoveCluster) != newClusterList.end()) + this->newClusterList.erase(removeCIter); + } + } + for (auto &cluster : this->newClusterList) { + auto clusterIter = + std::find(newClusterList.begin(), newClusterList.end(), cluster); + int index = (int)std::distance(newClusterList.begin(), clusterIter); + cluster.clusterLabel = index; + unordered_map removeGrids; + for (auto &gridOfCluster : cluster.grids) { + DensityGrid grid = gridOfCluster.first; + ; + if (newGridList.find(grid) != newGridList.end()) + newGridList.find(grid)->second.label = index; + else + removeGrids.insert(gridOfCluster); + } + for (auto &grid : removeGrids) { + if (cluster.grids.find(grid.first) != cluster.grids.end()) + cluster.grids.erase(grid.first); + } + } + // SESAME_INFO("Clean finish!"); + return newGridList; } // TODO stop right here and confused about get() /** - * Iterates through cluster_list to ensure that all empty clusters have been removed and - * that all cluster IDs match the cluster's index in cluster_list. + * Iterates through cluster_list to ensure that all empty clusters have been + * removed and that all cluster IDs match the cluster's index in cluster_list. 
*/ -void SESAME::V9::cleanClusters() -{ - //// SESAME_INFO("Clean Clusters"); - - std::vector toRemove; - - // Check to see if there are any empty clusters - for (auto &cluster : this->clusterList) - { - if (cluster.grids.empty()) toRemove.push_back(cluster); - } - // Remove empty clusters - if (!toRemove.empty()) - { - for (auto &RemoveCluster : toRemove) - { - auto removeCIter = std::find(clusterList.begin(), clusterList.end(), RemoveCluster); - if (std::find(clusterList.begin(), clusterList.end(), RemoveCluster) != - clusterList.end()) - this->clusterList.erase(removeCIter); - } - } - // Adjust remaining clusters as necessary, index = label = order - for (auto &cluster : this->clusterList) - { - auto clusterIter = std::find(clusterList.begin(), clusterList.end(), cluster); - int index = (int)std::distance(clusterList.begin(), clusterIter); - cluster.clusterLabel = index; - unordered_map removeGrids; - for (auto &gridOfCluster : cluster.grids) - { - DensityGrid grid = gridOfCluster.first; - if (gridList.find(grid) != gridList.end()) - gridList.find(grid)->second.label = index; - else - removeGrids.insert(gridOfCluster); - } - for (auto &grid : removeGrids) - { - if (cluster.grids.find(grid.first) != cluster.grids.end()) - cluster.grids.erase(grid.first); - } - this->clusterList.at(index) = cluster; - } +void SESAME::V9::cleanClusters() { + //// SESAME_INFO("Clean Clusters"); + + std::vector toRemove; + + // Check to see if there are any empty clusters + for (auto &cluster : this->clusterList) { + if (cluster.grids.empty()) + toRemove.push_back(cluster); + } + // Remove empty clusters + if (!toRemove.empty()) { + for (auto &RemoveCluster : toRemove) { + auto removeCIter = + std::find(clusterList.begin(), clusterList.end(), RemoveCluster); + if (std::find(clusterList.begin(), clusterList.end(), RemoveCluster) != + clusterList.end()) + this->clusterList.erase(removeCIter); + } + } + // Adjust remaining clusters as necessary, index = label = order + for (auto 
&cluster : this->clusterList) { + auto clusterIter = + std::find(clusterList.begin(), clusterList.end(), cluster); + int index = (int)std::distance(clusterList.begin(), clusterIter); + cluster.clusterLabel = index; + unordered_map removeGrids; + for (auto &gridOfCluster : cluster.grids) { + DensityGrid grid = gridOfCluster.first; + if (gridList.find(grid) != gridList.end()) + gridList.find(grid)->second.label = index; + else + removeGrids.insert(gridOfCluster); + } + for (auto &grid : removeGrids) { + if (cluster.grids.find(grid.first) != cluster.grids.end()) + cluster.grids.erase(grid.first); + } + this->clusterList.at(index) = cluster; + } } /** @@ -909,91 +834,79 @@ void SESAME::V9::cleanClusters() b. Else i. If (S1 && S2), mark as sporadic */ -void SESAME::V9::removeSporadic() -{ - // SESAME_INFO("REMOVE SPORADIC CALLED"); - // For each grid g in grid_list - - HashMap newGridList; - std::vector removeGridList; - for (auto &gridIter : this->gridList) - { - const DensityGrid &grid = gridIter.first; - CharacteristicVector characteristicVec = gridIter.second; - // If g is sporadic - if (characteristicVec.isSporadic) - { - // If currTime - tg > gap, delete g from grid_list - if (currentTimeStamp - characteristicVec.updateTime >= gap) - { - int gridClass = characteristicVec.label; - - if (gridClass != -1) - { - for (auto gridCluster : clusterList) - { - if (gridCluster.clusterLabel == gridClass) - { - gridCluster.removeGrid(grid); - } - } - } - removeGridList.push_back(grid); +void SESAME::V9::removeSporadic() { + // SESAME_INFO("REMOVE SPORADIC CALLED"); + // For each grid g in grid_list + + HashMap newGridList; + std::vector removeGridList; + for (auto &gridIter : this->gridList) { + const DensityGrid &grid = gridIter.first; + CharacteristicVector characteristicVec = gridIter.second; + // If g is sporadic + if (characteristicVec.isSporadic) { + // If currTime - tg > gap, delete g from grid_list + if (currentTimeStamp - characteristicVec.updateTime >= gap) { + int 
gridClass = characteristicVec.label; + + if (gridClass != -1) { + for (auto gridCluster : clusterList) { + if (gridCluster.clusterLabel == gridClass) { + gridCluster.removeGrid(grid); } - // Else if (S1 && S2), mark as sporadic - Else mark as normal - else - { - characteristicVec.isSporadic = checkIfSporadic(characteristicVec); - newGridList.insert(std::make_pair(grid, characteristicVec)); - } - } - // Else if (S1 && S2), mark as sporadic - else - { - characteristicVec.isSporadic = checkIfSporadic(characteristicVec); - newGridList.insert(std::make_pair(grid, characteristicVec)); + } } - } - mergeGridList(gridList, newGridList); - - // SESAME_INFO(" - Removed "<gridList.erase(sporadicGrid); - for (auto &cluster : this->clusterList) - { - if (cluster.grids.find(sporadicGrid) != cluster.grids.end()) - { - cluster.grids.erase(sporadicGrid); - } - } - for (auto &cluster : this->newClusterList) - { - if (cluster.grids.find(sporadicGrid) != cluster.grids.end()) - { - cluster.grids.erase(sporadicGrid); - break; - } - } - } + removeGridList.push_back(grid); + } + // Else if (S1 && S2), mark as sporadic - Else mark as normal + else { + characteristicVec.isSporadic = checkIfSporadic(characteristicVec); + newGridList.insert(std::make_pair(grid, characteristicVec)); + } + } + // Else if (S1 && S2), mark as sporadic + else { + characteristicVec.isSporadic = checkIfSporadic(characteristicVec); + newGridList.insert(std::make_pair(grid, characteristicVec)); + } + } + mergeGridList(gridList, newGridList); + + // SESAME_INFO(" - Removed "<gridList.erase(sporadicGrid); + for (auto &cluster : this->clusterList) { + if (cluster.grids.find(sporadicGrid) != cluster.grids.end()) { + cluster.grids.erase(sporadicGrid); + } + } + for (auto &cluster : this->newClusterList) { + if (cluster.grids.find(sporadicGrid) != cluster.grids.end()) { + cluster.grids.erase(sporadicGrid); + break; + } + } + } } /** - * Determines whether a sparse density grid is sporadic using rules S1 and S2 of Chen and Tu 
2007 + * Determines whether a sparse density grid is sporadic using rules S1 and S2 of + * Chen and Tu 2007 * - * @param characteristicVec - the CharacteristicVector of the density grid being assessed for - * sporadicity + * @param characteristicVec - the CharacteristicVector of the density grid being + * assessed for sporadicity */ -bool SESAME::V9::checkIfSporadic(CharacteristicVector characteristicVec) -{ - // Check S1 - if (characteristicVec.getCurrGridDensity(currentTimeStamp, param.lambda) < param.outlier_cap) - { - // Check S2 TODO CHANGE REMOVE TIME FROM 0 TO -1 - if (characteristicVec.removeTime == 0 || - (currentTimeStamp - ((1 + param.beta) * characteristicVec.removeTime)) >= 0) - return true; - } - - return false; +bool SESAME::V9::checkIfSporadic(CharacteristicVector characteristicVec) { + // Check S1 + if (characteristicVec.getCurrGridDensity(currentTimeStamp, param.lambda) < + param.outlier_cap) { + // Check S2 TODO CHANGE REMOVE TIME FROM 0 TO -1 + if (characteristicVec.removeTime == 0 || + (currentTimeStamp - + ((1 + param.beta) * characteristicVec.removeTime)) >= 0) + return true; + } + + return false; } \ No newline at end of file diff --git a/src/Algorithm/EDMStream.cpp b/src/Algorithm/EDMStream.cpp index 7decab27..1bad112f 100644 --- a/src/Algorithm/EDMStream.cpp +++ b/src/Algorithm/EDMStream.cpp @@ -4,190 +4,171 @@ #include -SESAME::EDMStream::EDMStream(param_t &cmd_params) -{ - this->param = cmd_params; - this->EDMParam.num_points = cmd_params.num_points; - this->EDMParam.dim = cmd_params.dim; - this->EDMParam.alpha = cmd_params.alpha; - this->EDMParam.lamda = cmd_params.lambda; - this->EDMParam.beta = cmd_params.beta; - this->EDMParam.num_cache = cmd_params.num_cache; - this->EDMParam.radius = cmd_params.radius; - this->EDMParam.minDelta = cmd_params.delta; - this->EDMParam.opt = cmd_params.opt; +SESAME::EDMStream::EDMStream(param_t &cmd_params) { + this->param = cmd_params; + this->EDMParam.num_points = cmd_params.num_points; + 
this->EDMParam.dim = cmd_params.dim; + this->EDMParam.alpha = cmd_params.alpha; + this->EDMParam.lamda = cmd_params.lambda; + this->EDMParam.beta = cmd_params.beta; + this->EDMParam.num_cache = cmd_params.num_cache; + this->EDMParam.radius = cmd_params.radius; + this->EDMParam.minDelta = cmd_params.delta; + this->EDMParam.opt = cmd_params.opt; } -SESAME::EDMStream::~EDMStream(){}; - -void SESAME::EDMStream::Init() -{ - this->alpha = 0; - this->cache = - SESAME::DataStructureFactory::creatCache(this->EDMParam.num_cache, this->EDMParam.alpha, - this->EDMParam.lamda, this->EDMParam.radius); - this->outres = SESAME::DataStructureFactory::createOutlierReservoir( - this->EDMParam.radius, this->EDMParam.alpha, this->EDMParam.lamda); - this->dpTree = - SESAME::DataStructureFactory::createDPTree(this->actCluMaxNum, this->EDMParam.radius); - this->dpTree->SetMinDelta(this->EDMParam.minDelta); - sum_timer.Tick(); +void SESAME::EDMStream::Init() { + this->alpha = 0; + this->cache = SESAME::DataStructureFactory::creatCache( + this->EDMParam.num_cache, this->EDMParam.alpha, this->EDMParam.lamda, + this->EDMParam.radius); + this->outres = SESAME::DataStructureFactory::createOutlierReservoir( + this->EDMParam.radius, this->EDMParam.alpha, this->EDMParam.lamda); + this->dpTree = SESAME::DataStructureFactory::createDPTree( + this->actCluMaxNum, this->EDMParam.radius); + this->dpTree->SetMinDelta(this->EDMParam.minDelta); + sum_timer.Tick(); } -void SESAME::EDMStream::setMinDelta(double minDelta) -{ - this->EDMParam.minDelta = minDelta; - this->dpTree->SetMinDelta(minDelta); +void SESAME::EDMStream::setMinDelta(double minDelta) { + this->EDMParam.minDelta = minDelta; + this->dpTree->SetMinDelta(minDelta); } -void SESAME::EDMStream::InitDP(double time) -{ - cache->compDeltaRho(time); - SESAME_DEBUG("beta = " << this->EDMParam.beta); - this->minRho = this->EDMParam.beta / (1 - pow(this->EDMParam.alpha, this->EDMParam.lamda)); - SESAME_DEBUG("minRho = " << this->minRho); +void 
SESAME::EDMStream::InitDP(double time) { + cache->compDeltaRho(time); + SESAME_DEBUG("beta = " << this->EDMParam.beta); + this->minRho = this->EDMParam.beta / + (1 - pow(this->EDMParam.alpha, this->EDMParam.lamda)); + SESAME_DEBUG("minRho = " << this->minRho); - this->deltaT = - (log(1 - pow(this->EDMParam.alpha, this->EDMParam.lamda)) / log(this->EDMParam.alpha) - - log(this->EDMParam.beta) / log(this->EDMParam.alpha)) / - this->EDMParam.lamda; - // double deltaT = 100; - SESAME_DEBUG("deltaT = " << this->deltaT); - outres->setTimeGap(this->deltaT); - cache->getDPTree(this->minRho, this->EDMParam.minDelta, dpTree, outres, clusters); - SESAME_DEBUG("dpTree size = " << dpTree->GetSize()); - dpTree->SetLastTime(time); + this->deltaT = (log(1 - pow(this->EDMParam.alpha, this->EDMParam.lamda)) / + log(this->EDMParam.alpha) - + log(this->EDMParam.beta) / log(this->EDMParam.alpha)) / + this->EDMParam.lamda; + // double deltaT = 100; + SESAME_DEBUG("deltaT = " << this->deltaT); + outres->setTimeGap(this->deltaT); + cache->getDPTree(this->minRho, this->EDMParam.minDelta, dpTree, outres, + clusters); + SESAME_DEBUG("dpTree size = " << dpTree->GetSize()); + dpTree->SetLastTime(time); } -SESAME::DPNodePtr SESAME::EDMStream::streamProcess(SESAME::PointPtr p, int opt, double time) -{ - win_timer.Tick(); - double coef = pow(this->EDMParam.alpha, this->EDMParam.lamda * (time - dpTree->GetLastTime())); - dpTree->SetLastTime(time); - win_timer.Tock(); - ds_timer.Tick(); - auto nn = dpTree->findNN(p, coef, opt, time); - ds_timer.Tock(); - out_timer.Tick(); - if (nn == nullptr || nn->GetDis() > dpTree->GetCluR()) - { - nn = outres->insert(p, time); - if (nn->GetRho() > this->minRho) - { - outres->remove(nn); - dpTree->insert(nn, opt); - } +SESAME::DPNodePtr SESAME::EDMStream::streamProcess(SESAME::PointPtr p, int opt, + double time) { + win_timer.Tick(); + double coef = pow(this->EDMParam.alpha, + this->EDMParam.lamda * (time - dpTree->GetLastTime())); + dpTree->SetLastTime(time); + 
win_timer.Tock(); + ds_timer.Tick(); + auto nn = dpTree->findNN(p, coef, opt, time); + ds_timer.Tock(); + out_timer.Tick(); + if (nn == nullptr || nn->GetDis() > dpTree->GetCluR()) { + nn = outres->insert(p, time); + if (nn->GetRho() > this->minRho) { + outres->remove(nn); + dpTree->insert(nn, opt); } - dpTree->deleteInact(outres, this->minRho, time); - out_timer.Tock(); - return nn; + } + dpTree->deleteInact(outres, this->minRho, time); + out_timer.Tock(); + return nn; +} +double SESAME::EDMStream::computeAlpha() { + return dpTree->computeAlpha(this->EDMParam.minDelta); +} +double SESAME::EDMStream::adjustMinDelta() { + return dpTree->adjustMinDelta(this->alpha); } -double SESAME::EDMStream::computeAlpha() { return dpTree->computeAlpha(this->EDMParam.minDelta); } -double SESAME::EDMStream::adjustMinDelta() { return dpTree->adjustMinDelta(this->alpha); } -void SESAME::EDMStream::delCluster() -{ - for (auto it = this->clusters.begin(); it != this->clusters.end();) - { - auto cluster = it->get(); - if (cluster->GetCells().begin() == cluster->GetCells().end()) - { - this->clusters.erase(it++); - } - else - { - it++; - } +void SESAME::EDMStream::delCluster() { + for (auto it = this->clusters.begin(); it != this->clusters.end();) { + auto cluster = it->get(); + if (cluster->GetCells().begin() == cluster->GetCells().end()) { + this->clusters.erase(it++); + } else { + it++; } + } } -SESAME::DPNodePtr SESAME::EDMStream::retrive(SESAME::PointPtr p, int opt, double time) -{ - SESAME::PointPtr curP = p; - if (!this->EDMParam.isInit) - { - auto cc = cache->add(curP, time); - if (cache->isFull()) - { - // draw decision graph - InitDP(time); - this->alpha = computeAlpha(); // TODO: what does it mean? 
- SESAME_DEBUG("alpha = " << this->alpha); - this->EDMParam.isInit = true; - } - return cc; +SESAME::DPNodePtr SESAME::EDMStream::retrive(SESAME::PointPtr p, int opt, + double time) { + SESAME::PointPtr curP = p; + if (!this->EDMParam.isInit) { + auto cc = cache->add(curP, time); + if (cache->isFull()) { + // draw decision graph + InitDP(time); + this->alpha = computeAlpha(); // TODO: what does it mean? + SESAME_DEBUG("alpha = " << this->alpha); + this->EDMParam.isInit = true; } - else - { - ds_timer.Tick(); - auto nn = streamProcess(curP, opt, time); - this->dpTree->adjustCluster(clusters); - ds_timer.Tock(); + return cc; + } else { + ds_timer.Tick(); + auto nn = streamProcess(curP, opt, time); + this->dpTree->adjustCluster(clusters); + ds_timer.Tock(); - out_timer.Tick(); - delCluster(); - out_timer.Tock(); - return nn; - } + out_timer.Tick(); + delCluster(); + out_timer.Tock(); + return nn; + } } -void SESAME::EDMStream::CountNode(const SESAME::DPNodePtr &node, int &num) -{ - num = num + 1; - if (!node->GetSucs().empty()) - { - for (const SESAME::DPNodePtr &el : node->GetSucs()) - { - CountNode(el, num); - } +void SESAME::EDMStream::CountNode(const SESAME::DPNodePtr &node, int &num) { + num = num + 1; + if (!node->GetSucs().empty()) { + for (const SESAME::DPNodePtr &el : node->GetSucs()) { + CountNode(el, num); } + } } -void SESAME::EDMStream::RunOnline(SESAME::PointPtr input) -{ - double curTime = input->index; - auto c = retrive(input, this->EDMParam.opt, curTime); - if (input->getIndex() % 100 == 0 && this->EDMParam.isInit) - { - ds_timer.Tick(); - setMinDelta(adjustMinDelta()); - this->dpTree->adjustCluster(this->clusters); - ds_timer.Tock(); - out_timer.Tick(); - this->delCluster(); - out_timer.Tock(); - } - lat_timer.Add(input->toa); +void SESAME::EDMStream::RunOnline(SESAME::PointPtr input) { + double curTime = input->index; + auto c = retrive(input, this->EDMParam.opt, curTime); + if (input->getIndex() % 100 == 0 && this->EDMParam.isInit) { + 
ds_timer.Tick(); + setMinDelta(adjustMinDelta()); + this->dpTree->adjustCluster(this->clusters); + ds_timer.Tock(); + out_timer.Tick(); + this->delCluster(); + out_timer.Tock(); + } + lat_timer.Add(input->toa); } -void SESAME::EDMStream::RunOffline(SESAME::DataSinkPtr sinkPtr) -{ - on_timer.Add(sum_timer.start); - ref_timer.Tick(); - int num = 0; - int sum = 0; - auto clu = 0; - for (const auto &cluster : this->clusters) - { - sum += num; - num = 0; - std::unordered_set cells = cluster->GetCells(); - for (const auto &cell : cells) - { - CountNode(cell->copy(), num); - PointPtr center = cell->GetCenter(); - center->setClusteringCenter(clu++); - center->setOutlier(false); - sinkPtr->put(center->copy()); - } - } - for (const auto &out : this->outres->getOutliers()) - { - sum += num; - num = 0; - CountNode(out->copy(), num); - PointPtr center = out->GetCenter(); - center->setClusteringCenter(clu++); - center->setOutlier(true); - sinkPtr->put(center->copy()); +void SESAME::EDMStream::RunOffline(SESAME::DataSinkPtr sinkPtr) { + on_timer.Add(sum_timer.start); + ref_timer.Tick(); + int num = 0; + int sum = 0; + auto clu = 0; + for (const auto &cluster : this->clusters) { + sum += num; + num = 0; + std::unordered_set cells = cluster->GetCells(); + for (const auto &cell : cells) { + CountNode(cell->copy(), num); + PointPtr center = cell->GetCenter(); + center->setClusteringCenter(clu++); + center->setOutlier(false); + sinkPtr->put(center->copy()); } - ref_timer.Tock(); - sum_timer.Tock(); + } + for (const auto &out : this->outres->getOutliers()) { + sum += num; + num = 0; + CountNode(out->copy(), num); + PointPtr center = out->GetCenter(); + center->setClusteringCenter(clu++); + center->setOutlier(true); + sinkPtr->put(center->copy()); + } + ref_timer.Tock(); + sum_timer.Tock(); } diff --git a/src/Algorithm/OfflineRefinement/ConnectedRegions.cpp b/src/Algorithm/OfflineRefinement/ConnectedRegions.cpp index aa7daa76..c7c772b3 100644 --- 
a/src/Algorithm/OfflineRefinement/ConnectedRegions.cpp +++ b/src/Algorithm/OfflineRefinement/ConnectedRegions.cpp @@ -4,156 +4,153 @@ #include SESAME::ConnectedRegions::ConnectedRegions() {} -SESAME::ConnectedRegions::ConnectedRegions(double alpha, double min_weight) -{ - this->alpha = alpha; - this->min_weight = min_weight; +SESAME::ConnectedRegions::ConnectedRegions(double alpha, double min_weight) { + this->alpha = alpha; + this->min_weight = min_weight; } -void SESAME::ConnectedRegions::connection(std::vector µClusters, +void SESAME::ConnectedRegions::connection( + std::vector µClusters, - SESAME::WeightedAdjacencyList &weightedAdjacencyList) -{ - WeightedAdjacencyList::iterator iterW; - for (iterW = weightedAdjacencyList.begin(); iterW != weightedAdjacencyList.end(); iterW++) - { - // std::cout<<" cluster 1 weight "<first.microCluster1->weight - // <<", cluster 2 weight "<first.microCluster2->weight<<"weight min is - // "<first.microCluster1->weight >= min_weight && - iterW->first.microCluster2->weight >= min_weight) - { - double val = 2 * iterW->second->weight / - (iterW->first.microCluster1->weight + iterW->first.microCluster2->weight); - if (val > min_weight) - { - insertIntoGraph(microClusters, iterW->first.microCluster1->id.front(), - iterW->first.microCluster2->id.front()); + SESAME::WeightedAdjacencyList &weightedAdjacencyList) { + WeightedAdjacencyList::iterator iterW; + for (iterW = weightedAdjacencyList.begin(); + iterW != weightedAdjacencyList.end(); iterW++) { + // std::cout<<" cluster 1 weight "<first.microCluster1->weight + // <<", cluster 2 weight "<first.microCluster2->weight<<"weight min + // is + // "<first.microCluster1->weight >= min_weight && + iterW->first.microCluster2->weight >= min_weight) { + double val = 2 * iterW->second->weight / + (iterW->first.microCluster1->weight + + iterW->first.microCluster2->weight); + if (val > min_weight) { + insertIntoGraph(microClusters, iterW->first.microCluster1->id.front(), + 
iterW->first.microCluster2->id.front()); - insertIntoGraph(microClusters, iterW->first.microCluster2->id.front(), - iterW->first.microCluster1->id.front()); - } - else - { - insertIntoGraph(microClusters, iterW->first.microCluster1->id.front()); - insertIntoGraph(microClusters, iterW->first.microCluster2->id.front()); - } - } + insertIntoGraph(microClusters, iterW->first.microCluster2->id.front(), + iterW->first.microCluster1->id.front()); + } else { + insertIntoGraph(microClusters, iterW->first.microCluster1->id.front()); + insertIntoGraph(microClusters, iterW->first.microCluster2->id.front()); + } } - findConnectedComponents(microClusters); + } + findConnectedComponents(microClusters); } /** - * @Description: insert vertices and entries into connectivity graph when micro cluster pair - * connectivity value greater than the intersection threshold - * if the graph has testing micro cluster, add connected strong MC in the corresponding entries - * else, create new V,E into the graph + * @Description: insert vertices and entries into connectivity graph when micro + * cluster pair connectivity value greater than the intersection threshold if + * the graph has testing micro cluster, add connected strong MC in the + * corresponding entries else, create new V,E into the graph * @Param: connectivity graph, micro cluster 1 and 2 * @Return: void */ -void SESAME::ConnectedRegions::insertIntoGraph(const std::vector µClusters, - int microClusterId, int OtherId) -{ - if (connecvtivityGraphId.find(microClusterId) != connecvtivityGraphId.end()) - { - connecvtivityGraphId[microClusterId].push_back(OtherId); - } - else - { - auto microCluster = std::find_if( - microClusters.begin(), microClusters.end(), - [&](const MicroClusterPtr &mc) { return mc->id.front() == microClusterId; }); - (*microCluster)->visited = false; - std::vector newMicroClusterIdSet; - newMicroClusterIdSet.push_back(OtherId); - connecvtivityGraphId.insert(make_pair(microClusterId, OtherId)); - } +void 
SESAME::ConnectedRegions::insertIntoGraph( + const std::vector µClusters, int microClusterId, + int OtherId) { + if (connecvtivityGraphId.find(microClusterId) != connecvtivityGraphId.end()) { + connecvtivityGraphId[microClusterId].push_back(OtherId); + } else { + auto microCluster = std::find_if(microClusters.begin(), microClusters.end(), + [&](const MicroClusterPtr &mc) { + return mc->id.front() == microClusterId; + }); + (*microCluster)->visited = false; + std::vector newMicroClusterIdSet; + newMicroClusterIdSet.push_back(OtherId); + connecvtivityGraphId.insert(make_pair(microClusterId, OtherId)); + } } -void SESAME::ConnectedRegions::insertIntoGraph(const std::vector µClusters, - int microClusterId) -{ - if (connecvtivityGraphId.find(microClusterId) == connecvtivityGraphId.end()) - { - // std::cerr << "INSERT micro cluster id is " << microClusterId << std::endl; - auto microCluster = std::find_if( - microClusters.begin(), microClusters.end(), - [&](const MicroClusterPtr &mc) { return mc->id.front() == microClusterId; }); - (*microCluster)->visited = false; - std::vector newMicroClusterIdSet; - connecvtivityGraphId.insert(make_pair(microClusterId, newMicroClusterIdSet)); - } +void SESAME::ConnectedRegions::insertIntoGraph( + const std::vector µClusters, int microClusterId) { + if (connecvtivityGraphId.find(microClusterId) == connecvtivityGraphId.end()) { + // std::cerr << "INSERT micro cluster id is " << microClusterId << + // std::endl; + auto microCluster = std::find_if(microClusters.begin(), microClusters.end(), + [&](const MicroClusterPtr &mc) { + return mc->id.front() == microClusterId; + }); + (*microCluster)->visited = false; + std::vector newMicroClusterIdSet; + connecvtivityGraphId.insert( + make_pair(microClusterId, newMicroClusterIdSet)); + } } void SESAME::ConnectedRegions::findConnectedComponents( - const std::vector µClusters) -{ - unordered_map>::iterator iter; - // This variable just for indicating the id of micro cluster which forming macro 
clusters - for (iter = connecvtivityGraphId.begin(); iter != connecvtivityGraphId.end(); iter++) - { - std::vector idList; - auto microClusterKey = + const std::vector µClusters) { + unordered_map>::iterator iter; + // This variable just for indicating the id of micro cluster which forming + // macro clusters + for (iter = connecvtivityGraphId.begin(); iter != connecvtivityGraphId.end(); + iter++) { + std::vector idList; + auto microClusterKey = + std::find_if(microClusters.begin(), microClusters.end(), + [&](const MicroClusterPtr &mc) { + return mc->id.front() == iter->first; + }); + if (microClusterKey != microClusters.end() && + !(*microClusterKey)->visited) { + std::vector newCluster; + newCluster.push_back((*microClusterKey)); + idList.push_back(iter->first); + for (int iterValue : iter->second) { + auto microClusterElement = std::find_if(microClusters.begin(), microClusters.end(), - [&](const MicroClusterPtr &mc) { return mc->id.front() == iter->first; }); - if (microClusterKey != microClusters.end() && !(*microClusterKey)->visited) - { - std::vector newCluster; - newCluster.push_back((*microClusterKey)); - idList.push_back(iter->first); - for (int iterValue : iter->second) - { - auto microClusterElement = std::find_if( - microClusters.begin(), microClusters.end(), - [&](const MicroClusterPtr &mc) { return mc->id.front() == iterValue; }); - if (microClusterElement != microClusters.end()) - { - if (!(*microClusterElement)->visited) - { - newCluster.push_back((*microClusterElement)); - (*microClusterElement)->visited = true; - idList.push_back((*microClusterElement)->id.front()); - } - } - } - this->finalClusters.push_back(newCluster); + [&](const MicroClusterPtr &mc) { + return mc->id.front() == iterValue; + }); + if (microClusterElement != microClusters.end()) { + if (!(*microClusterElement)->visited) { + newCluster.push_back((*microClusterElement)); + (*microClusterElement)->visited = true; + idList.push_back((*microClusterElement)->id.front()); + } } + } + 
this->finalClusters.push_back(newCluster); } + } } -std::vector SESAME::ConnectedRegions::ResultsToDataSink() -{ - // SESAME_INFO("Start resize "< SESAME::ConnectedRegions::ResultsToDataSink() { + // SESAME_INFO("Start resize "< points; - for (auto iter = 0; iter != finalClusters.size(); iter++) - { // initialize pseudo point of macro clusters - PointPtr point = - DataStructureFactory::createPoint(iter, 0, finalClusters.at(iter).front()->dim, 0); - // This is just for testing, need to delete - std::vector centroid(finalClusters.at(iter).front()->dim, 0); - // TODO maybe wrong ;so dizzy - for (auto j = 0; j < finalClusters.at(iter).size(); j++) - { - double currentWeight = point->getWeight() + finalClusters.at(iter).at(j)->weight; - point->setWeight(currentWeight); - for (auto a = 0; a < finalClusters.at(iter).at(j)->dim; a++) - { - if (j == 0) point->setFeatureItem(0, a); - point->setFeatureItem( - point->getFeatureItem(a) + finalClusters.at(iter).at(j)->centroid.at(a), a); - centroid[a] = point->getFeatureItem(a); // testing - if (j == finalClusters.at(iter).size() - 1) - { - point->setFeatureItem( - point->getFeatureItem(a) / finalClusters.at(iter).at(j)->dim, a); - centroid[a] = centroid[a] / finalClusters.at(iter).at(j)->dim; // testing - } - } + std::vector points; + for (auto iter = 0; iter != finalClusters.size(); + iter++) { // initialize pseudo point of macro clusters + PointPtr point = + GenericFactory::New(finalClusters.at(iter).front()->dim, iter); + // This is just for testing, need to delete + std::vector centroid(finalClusters.at(iter).front()->dim, 0); + // TODO maybe wrong ;so dizzy + for (auto j = 0; j < finalClusters.at(iter).size(); j++) { + double currentWeight = + point->getWeight() + finalClusters.at(iter).at(j)->weight; + point->setWeight(currentWeight); + for (auto a = 0; a < finalClusters.at(iter).at(j)->dim; a++) { + if (j == 0) + point->setFeatureItem(0, a); + point->setFeatureItem(point->getFeatureItem(a) + + 
finalClusters.at(iter).at(j)->centroid.at(a), + a); + centroid[a] = point->getFeatureItem(a); // testing + if (j == finalClusters.at(iter).size() - 1) { + point->setFeatureItem( + point->getFeatureItem(a) / finalClusters.at(iter).at(j)->dim, a); + centroid[a] = + centroid[a] / finalClusters.at(iter).at(j)->dim; // testing } - points.push_back(point); - // std::stringstream results; - // std::copy(centroid.begin(),centroid.end(),std::ostream_iterator(results, " ")); - // SESAME_INFO("The NO."<(results, + // " ")); SESAME_INFO("The NO."<min_points = minPts; - this->epsilon = eps; - // this->pointSize = size; - this->clusterID = 0; +SESAME::DBSCAN::DBSCAN(unsigned int minPts, float eps) { + this->min_points = minPts; + this->epsilon = eps; + // this->pointSize = size; + this->clusterID = 0; } -SESAME::DBSCAN::DBSCAN() -{ - this->min_points = 0; - this->epsilon = 0; - // this->pointSize = size; - this->clusterID = 0; +SESAME::DBSCAN::DBSCAN() { + this->min_points = 0; + this->epsilon = 0; + // this->pointSize = size; + this->clusterID = 0; } SESAME::DBSCAN::~DBSCAN() = default; -void SESAME::DBSCAN::run(std::vector &input) -{ - for (auto &i : input) - { - i->setClusteringCenter(UNCLASSIFIED); - } - for (int i = 0; i < input.size(); i++) - { - if (input[i]->getClusteringCenter() == UNCLASSIFIED) - { - if (expandCluster(input, input[i], clusterID) != FAILURE) - { - clusterID += 1; - } - } +void SESAME::DBSCAN::run(std::vector &input) { + for (auto &i : input) { + i->setClusteringCenter(UNCLASSIFIED); + } + for (int i = 0; i < input.size(); i++) { + if (input[i]->getClusteringCenter() == UNCLASSIFIED) { + if (expandCluster(input, input[i], clusterID) != FAILURE) { + clusterID += 1; + } } + } } void SESAME::DBSCAN::Run(SesameParam ¶m, std::vector &input, - SESAME::DataSinkPtr sinkPtr) -{ - this->min_points = param.min_points; - this->epsilon = param.epsilon; - // this->pointSize = size; - for (auto &i : input) - { - i->setClusteringCenter(UNCLASSIFIED); + 
SESAME::DataSinkPtr sinkPtr) { + this->min_points = param.min_points; + this->epsilon = param.epsilon; + // this->pointSize = size; + for (auto &i : input) { + i->setClusteringCenter(UNCLASSIFIED); + } + this->clusterID = 0; + for (int i = 0; i < input.size(); i++) { + if (input[i]->getClusteringCenter() == UNCLASSIFIED) { + if (expandCluster(input, input[i], clusterID) != FAILURE) { + clusterID += 1; + } } - this->clusterID = 0; - for (int i = 0; i < input.size(); i++) - { - if (input[i]->getClusteringCenter() == UNCLASSIFIED) - { - if (expandCluster(input, input[i], clusterID) != FAILURE) - { - clusterID += 1; - } - } - } - produceResult(input, sinkPtr); + } + produceResult(input, sinkPtr); } int SESAME::DBSCAN::expandCluster(std::vector &input, PointPtr &point, - int clusterID) const -{ - std::vector clusterSeeds = calculateCluster(input, point); - if (clusterSeeds.size() < min_points) - { - point->setClusteringCenter(NOISE); - return FAILURE; - } - else - { - int index, indexCorePoint = 0; - for (int iterSeeds = 0; iterSeeds < clusterSeeds.size(); iterSeeds++) - { - index = clusterSeeds.at(iterSeeds); + int clusterID) const { + std::vector clusterSeeds = calculateCluster(input, point); + if (clusterSeeds.size() < min_points) { + point->setClusteringCenter(NOISE); + return FAILURE; + } else { + int index, indexCorePoint = 0; + for (int iterSeeds = 0; iterSeeds < clusterSeeds.size(); iterSeeds++) { + index = clusterSeeds.at(iterSeeds); - input.at(index)->setClusteringCenter(clusterID); - if (judgeCorePoint(input.at(index), - point)) // check if the seed point in input is the core point? - indexCorePoint = iterSeeds; - } - clusterSeeds.erase(clusterSeeds.begin() + indexCorePoint); + input.at(index)->setClusteringCenter(clusterID); + if (judgeCorePoint( + input.at(index), + point)) // check if the seed point in input is the core point? 
+ indexCorePoint = iterSeeds; + } + clusterSeeds.erase(clusterSeeds.begin() + indexCorePoint); - for (std::vector::size_type i = 0, currentSize = clusterSeeds.size(); i < currentSize; - i++) // ++i or i++? - { - std::vector clusterNeighbors = calculateCluster(input, input.at(clusterSeeds[i])); - if (clusterNeighbors.size() >= min_points) - { - for (std::vector::size_type iterNeighbors = 0; - iterNeighbors < clusterNeighbors.size(); iterNeighbors++) - { - index = clusterNeighbors.at(iterNeighbors); - if (input.at(index)->getClusteringCenter() == UNCLASSIFIED || - input.at(index)->getClusteringCenter() == NOISE) - { - if (input.at(index)->getClusteringCenter() == UNCLASSIFIED) - { - clusterSeeds.push_back(index); - currentSize = clusterSeeds.size(); - } - input.at(index)->setClusteringCenter(clusterID); - } - } + for (std::vector::size_type i = 0, currentSize = clusterSeeds.size(); + i < currentSize; i++) // ++i or i++? + { + std::vector clusterNeighbors = + calculateCluster(input, input.at(clusterSeeds[i])); + if (clusterNeighbors.size() >= min_points) { + for (std::vector::size_type iterNeighbors = 0; + iterNeighbors < clusterNeighbors.size(); iterNeighbors++) { + index = clusterNeighbors.at(iterNeighbors); + if (input.at(index)->getClusteringCenter() == UNCLASSIFIED || + input.at(index)->getClusteringCenter() == NOISE) { + if (input.at(index)->getClusteringCenter() == UNCLASSIFIED) { + clusterSeeds.push_back(index); + currentSize = clusterSeeds.size(); } + input.at(index)->setClusteringCenter(clusterID); + } } - return SUCCESS; + } } + return SUCCESS; + } } std::vector SESAME::DBSCAN::calculateCluster(std::vector &input, - PointPtr &point) const -{ - std::vector clusterIndex; - for (int i = 0; i < input.size(); i++) - { - if (calculateEluDistance(point, input[i]) <= epsilon) clusterIndex.push_back(i); - } - return clusterIndex; + PointPtr &point) const { + std::vector clusterIndex; + for (int i = 0; i < input.size(); i++) { + if (calculateEluDistance(point, 
input[i]) <= epsilon) + clusterIndex.push_back(i); + } + return clusterIndex; } -double SESAME::DBSCAN::calculateEluDistance(PointPtr &point, PointPtr ¢er) -{ - double dist = 0; - for (int i = 0; i < point->getDimension(); i++) - { - dist += pow(point->getFeatureItem(i) - center->getFeatureItem(i), 2); - } - dist = sqrt(dist); - return dist; +double SESAME::DBSCAN::calculateEluDistance(PointPtr &point, PointPtr ¢er) { + double dist = 0; + for (int i = 0; i < point->getDimension(); i++) { + dist += pow(point->getFeatureItem(i) - center->getFeatureItem(i), 2); + } + dist = sqrt(dist); + return dist; } -bool SESAME::DBSCAN::judgeCorePoint(PointPtr &point, PointPtr &other) -{ - bool corePoint = true; - for (int i = 0; i < point->getDimension(); i++) - { - if (point->getFeatureItem(i) != other->getFeatureItem(i)) corePoint = false; - } - return corePoint; +bool SESAME::DBSCAN::judgeCorePoint(PointPtr &point, PointPtr &other) { + bool corePoint = true; + for (int i = 0; i < point->getDimension(); i++) { + if (point->getFeatureItem(i) != other->getFeatureItem(i)) + corePoint = false; + } + return corePoint; } -// TODO: whether to output the noise data? In our implementation, we output the noise data since it -// is still a cluster -void SESAME::DBSCAN::produceResult(std::vector &input, SESAME::DataSinkPtr sinkPtr) -{ - for (auto el : input) - { - if (el->getClusteringCenter() == UNCLASSIFIED or NOISE) - { // not noise or unclassified - el->setClusteringCenter(clusterID++); - } - sinkPtr->put(el); // point index start from 0 +// TODO: whether to output the noise data? 
In our implementation, we output the +// noise data since it is still a cluster +void SESAME::DBSCAN::produceResult(std::vector &input, + SESAME::DataSinkPtr sinkPtr) { + for (auto el : input) { + if (el->getClusteringCenter() == UNCLASSIFIED or + NOISE) { // not noise or unclassified + el->setClusteringCenter(clusterID++); } + sinkPtr->put(el); // point index start from 0 + } } diff --git a/src/Algorithm/OfflineRefinement/KMeans.cpp b/src/Algorithm/OfflineRefinement/KMeans.cpp index 9c05bb2e..495c9bb2 100644 --- a/src/Algorithm/OfflineRefinement/KMeans.cpp +++ b/src/Algorithm/OfflineRefinement/KMeans.cpp @@ -1,250 +1,224 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) // // Created by Shuhao Zhang on 20/07/2021. // #include -#include #include +#include #include /** - * @Description: First step: select k elements randomly in input as initial clustering centers + * @Description: First step: select k elements randomly in input as initial + * clustering centers */ void SESAME::KMeans::randomSelectCenters(int numberOfCenters, int numberOfInput, std::vector &input, - std::vector ¢ers) -{ - // init_genrand(10); - // put one point into the result - - std::vector indexs; // for unique - int id = rand() % numberOfInput; - indexs.push_back(input[id]->getIndex()); - centers.push_back(input[id]->copy()); - int c = 1; // count the number of element in center - // run the loop to insert the randomly selected indexes - - while (c < numberOfCenters) - { - id = rand() % numberOfInput; - if (count(indexs.begin(), indexs.end(), id) == 0) - { - indexs.push_back(input[id]->getIndex()); - centers.push_back(input[id]->copy()); - c++; - } + std::vector ¢ers) { + // init_genrand(10); + // put one point into the result + + std::vector indexs; // for unique + int id = rand() % numberOfInput; + indexs.push_back(input[id]->getIndex()); + 
centers.push_back(input[id]->copy()); + int c = 1; // count the number of element in center + // run the loop to insert the randomly selected indexes + + while (c < numberOfCenters) { + id = rand() % numberOfInput; + if (count(indexs.begin(), indexs.end(), id) == 0) { + indexs.push_back(input[id]->getIndex()); + centers.push_back(input[id]->copy()); + c++; } - - // print information - if (indexs.size() != numberOfCenters) - SESAME_INFO("ERROR!!! number of centers in indexs is not right!"); - // TODO : I commented out the following output - /* - * cout << "Randomly selected indexes:"; - for (int index : indexs) { - cout << index << " "; - } - cout << endl; - */ + } + + // print information + if (indexs.size() != numberOfCenters) + SESAME_INFO("ERROR!!! number of centers in indexs is not right!"); + // TODO : I commented out the following output + /* + * cout << "Randomly selected indexes:"; + for (int index : indexs) { + cout << index << " "; + } + cout << endl; + */ } /** - * @Description: select the k initial centers from the function determined by point weight + * @Description: select the k initial centers from the function determined by + * point weight */ -void SESAME::KMeans::selectCentersFromWeight(int numberOfCenters, int numberOfInput, +void SESAME::KMeans::selectCentersFromWeight(int numberOfCenters, + int numberOfInput, std::vector &input, - std::vector ¢ers) -{ - std::vector indexs; // for unique - indexs.push_back(centers.at(0)->getIndex()); - // cout << "KMeans++ randomly select indexes: "; - int times = 0; - while (times < numberOfCenters) - { - double sum = 0; - vector leftOver; - std::vector weightSquare, probability; - for (int i = 0; i < numberOfInput; i++) - { - if (count(indexs.begin(), indexs.end(), input.at(i)->getIndex()) == 0) - { - leftOver.push_back(input.at(i)->copy()); - double Min = calculateEluDistance(input.at(i), centers.at(0)); - // Min is D(x) - for (int j = 1; j < centers.size(); j++) - { - if (Min > 
calculateEluDistance(input.at(i), centers.at(j))) - { - Min = calculateEluDistance(input.at(i), centers.at(j)); - } - } - weightSquare.push_back(pow(Min, 2)); - sum += pow(Min, 2); - // here we only need to store D2.txt(x) - } + std::vector ¢ers) { + std::vector indexs; // for unique + indexs.push_back(centers.at(0)->getIndex()); + // cout << "KMeans++ randomly select indexes: "; + int times = 0; + while (times < numberOfCenters) { + double sum = 0; + vector leftOver; + std::vector weightSquare, probability; + for (int i = 0; i < numberOfInput; i++) { + if (count(indexs.begin(), indexs.end(), input.at(i)->getIndex()) == 0) { + leftOver.push_back(input.at(i)->copy()); + double Min = calculateEluDistance(input.at(i), centers.at(0)); + // Min is D(x) + for (int j = 1; j < centers.size(); j++) { + if (Min > calculateEluDistance(input.at(i), centers.at(j))) { + Min = calculateEluDistance(input.at(i), centers.at(j)); + } } + weightSquare.push_back(pow(Min, 2)); + sum += pow(Min, 2); + // here we only need to store D2.txt(x) + } + } - // calculate the weight of every other point(except centers) - for (double i : weightSquare) - { - probability.push_back(i / sum); - } + // calculate the weight of every other point(except centers) + for (double i : weightSquare) { + probability.push_back(i / sum); + } - double left = 0, right = 0; - double r = ((double)rand() / (RAND_MAX)); - for (int i = 0; i < leftOver.size(); i++) - { - right += probability.at(i); - if (r > left && r <= right) - { - // cout << "randomly generate number: " << u(e) << endl; - if (count(indexs.begin(), indexs.end(), leftOver.at(i)->getIndex()) == 0) - { - indexs.push_back(leftOver.at(i)->getIndex()); - centers.push_back(leftOver.at(i)->copy()); - // cout << leftOver.at(i)->getIndex() << " "; - break; - } - } - left = right; + double left = 0, right = 0; + double r = ((double)rand() / (RAND_MAX)); + for (int i = 0; i < leftOver.size(); i++) { + right += probability.at(i); + if (r > left && r <= right) { + // 
cout << "randomly generate number: " << u(e) << endl; + if (count(indexs.begin(), indexs.end(), leftOver.at(i)->getIndex()) == + 0) { + indexs.push_back(leftOver.at(i)->getIndex()); + centers.push_back(leftOver.at(i)->copy()); + // cout << leftOver.at(i)->getIndex() << " "; + break; } - times++; + } + left = right; } - // cout << endl; + times++; + } + // cout << endl; } /** * @Description: Calculate norm2 distance from point to one of the center */ -double SESAME::KMeans::calculateEluDistance(PointPtr &point, PointPtr ¢er) -{ - double dist = 0; - for (int i = 0; i < point->getDimension(); i++) - { - dist += pow(point->getFeatureItem(i) - center->getFeatureItem(i), 2); - } - dist = sqrt(dist); - return dist; +double SESAME::KMeans::calculateEluDistance(PointPtr &point, PointPtr ¢er) { + double dist = 0; + for (int i = 0; i < point->getDimension(); i++) { + dist += pow(point->getFeatureItem(i) - center->getFeatureItem(i), 2); + } + dist = sqrt(dist); + return dist; } /** * @Description: Calculate the new clustering center from groups */ -void SESAME::KMeans::calculateClusterCenter(PointPtr ¢er, std::vector &group) -{ - for (int i = 0; i < center->getDimension(); i++) - { - double sum = 0; - if (!group.empty()) - { - for (auto &j : group) - { - sum += j->getFeatureItem(i); - } - center->setFeatureItem(sum / double(group.size()), i); - } +void SESAME::KMeans::calculateClusterCenter(PointPtr ¢er, + std::vector &group) { + for (int i = 0; i < center->getDimension(); i++) { + double sum = 0; + if (!group.empty()) { + for (auto &j : group) { + sum += j->getFeatureItem(i); + } + center->setFeatureItem(sum / double(group.size()), i); } + } } /** * @Description: Second Step: group every input points to the nearest centers */ -void SESAME::KMeans::groupPointsByCenters(int numberOfCenters, int numberOfInput, - std::vector &input, - std::vector ¢ers, - std::vector> &groups) -{ - for (int i = 0; i < centers.size(); i++) - { - std::vector initial; - groups.push_back(initial); - } 
- int Id; - for (int i = 0; i < input.size(); i++) - { - double Min = calculateEluDistance(input.at(i), centers.at(0)); - Id = 0; // cluster_id that the point belongs to - for (int j = 1; j < centers.size(); j++) - { - if (Min > calculateEluDistance(input.at(i), centers.at(j))) - { - Id = j; - Min = calculateEluDistance(input.at(i), centers.at(j)); - } - } - groups[Id].push_back(input.at(i)); +void SESAME::KMeans::groupPointsByCenters( + int numberOfCenters, int numberOfInput, std::vector &input, + std::vector ¢ers, + std::vector> &groups) { + for (int i = 0; i < centers.size(); i++) { + std::vector initial; + groups.push_back(initial); + } + int Id; + for (int i = 0; i < input.size(); i++) { + double Min = calculateEluDistance(input.at(i), centers.at(0)); + Id = 0; // cluster_id that the point belongs to + for (int j = 1; j < centers.size(); j++) { + if (Min > calculateEluDistance(input.at(i), centers.at(j))) { + Id = j; + Min = calculateEluDistance(input.at(i), centers.at(j)); + } } + groups[Id].push_back(input.at(i)); + } } /** - * @Description: Third Step: choose new clustering center from group points again + * @Description: Third Step: choose new clustering center from group points + * again */ -void SESAME::KMeans::adjustClusteringCenters(std::vector ¢ers, - std::vector> &groups) -{ - for (int i = 0; i < groups.size(); i++) - { - calculateClusterCenter(centers[i], groups[i]); - } +void SESAME::KMeans::adjustClusteringCenters( + std::vector ¢ers, + std::vector> &groups) { + for (int i = 0; i < groups.size(); i++) { + calculateClusterCenter(centers[i], groups[i]); + } } /** - * @Description: refresh group,replace the old one with the new one and clean up the new group + * @Description: refresh group,replace the old one with the new one and clean up + * the new group */ -void SESAME::KMeans::refreshGroup(std::vector> &oldGroups, - std::vector> &newGroups) -{ - oldGroups.assign(newGroups.begin(), newGroups.end()); - newGroups.clear(); - std::vector> tmp; - 
newGroups.swap(tmp); +void SESAME::KMeans::refreshGroup( + std::vector> &oldGroups, + std::vector> &newGroups) { + oldGroups.assign(newGroups.begin(), newGroups.end()); + newGroups.clear(); + std::vector> tmp; + newGroups.swap(tmp); } /** - * @Description: if the old groups equals the new ones, set flagToStop to true to stop KMeans + * @Description: if the old groups equals the new ones, set flagToStop to true + * to stop KMeans */ -void SESAME::KMeans::checkStopStatus(bool &flag, std::vector> &oldGroups, - std::vector> &newGroups) -{ - flag = true; - if (oldGroups.size() == newGroups.size()) - { - for (int i = 0; i < oldGroups.size(); i++) - { - if (oldGroups[i].size() == newGroups[i].size()) - { - for (int j = 1; j < oldGroups[i].size(); j++) - { - if (oldGroups[i][j]->getIndex() != newGroups[i][j]->getIndex()) - { - flag = false; - } - } - } - else - { - flag = false; - } +void SESAME::KMeans::checkStopStatus( + bool &flag, std::vector> &oldGroups, + std::vector> &newGroups) { + flag = true; + if (oldGroups.size() == newGroups.size()) { + for (int i = 0; i < oldGroups.size(); i++) { + if (oldGroups[i].size() == newGroups[i].size()) { + for (int j = 1; j < oldGroups[i].size(); j++) { + if (oldGroups[i][j]->getIndex() != newGroups[i][j]->getIndex()) { + flag = false; + } } - } - else - { + } else { flag = false; + } } - if (!flag) - { - // SESAME_INFO("Point cluster need to be adjust, start a new iteration!"); - } + } else { + flag = false; + } + if (!flag) { + // SESAME_INFO("Point cluster need to be adjust, start a new iteration!"); + } } /** @@ -252,28 +226,23 @@ void SESAME::KMeans::checkStopStatus(bool &flag, std::vector> &groups, - std::vector &output) -{ - for (int i = 0; i < groups.size(); i++) - { - for (int j = 0; j < groups[i].size(); j++) - { - groups[i][j]->setClusteringCenter(i); - output.push_back(groups[i][j]->copy()); // point index start from 0 - } + std::vector &output) { + for (int i = 0; i < groups.size(); i++) { + for (int j = 0; j < 
groups[i].size(); j++) { + groups[i][j]->setClusteringCenter(i); + output.push_back(groups[i][j]->copy()); // point index start from 0 } + } } -void SESAME::KMeans::produceResult(std::vector> &groups, - SESAME::DataSinkPtr sinkPtr) -{ - for (int i = 0; i < groups.size(); i++) - { - for (int j = 0; j < groups[i].size(); j++) - { - groups[i][j]->setClusteringCenter(i); - sinkPtr->put(groups[i][j]->copy()); // point index start from 0 - } +void SESAME::KMeans::produceResult( + std::vector> &groups, + SESAME::DataSinkPtr sinkPtr) { + for (int i = 0; i < groups.size(); i++) { + for (int j = 0; j < groups[i].size(); j++) { + groups[i][j]->setClusteringCenter(i); + sinkPtr->put(groups[i][j]->copy()); // point index start from 0 } + } } /** @@ -281,206 +250,182 @@ void SESAME::KMeans::produceResult(std::vector> &g */ void SESAME::KMeans::runKMeans(int numberOfCenters, int numberOfInput, - std::vector ¢ers, std::vector &input, + std::vector ¢ers, + std::vector &input, std::vector> &oldGroups, - std::vector> &newGroups, int seed, - bool kmeanspp) -{ - bool flagToStop = false; - srand(seed); - if (kmeanspp) - { - // run the first step in KMeans++ - SESAME_INFO("KMeans++ start!!!"); - randomSelectCenters(1, numberOfInput, input, centers); - int resetCenter = numberOfCenters - 1; - selectCentersFromWeight(resetCenter, numberOfInput, input, centers); - - // run the second step in KMeans++ - groupPointsByCenters(numberOfCenters, numberOfInput, input, centers, oldGroups); + std::vector> &newGroups, + int seed, bool kmeanspp) { + bool flagToStop = false; + srand(seed); + if (kmeanspp) { + // run the first step in KMeans++ + SESAME_INFO("KMeans++ start!!!"); + randomSelectCenters(1, numberOfInput, input, centers); + int resetCenter = numberOfCenters - 1; + selectCentersFromWeight(resetCenter, numberOfInput, input, centers); + + // run the second step in KMeans++ + groupPointsByCenters(numberOfCenters, numberOfInput, input, centers, + oldGroups); + } else { + SESAME_INFO("KMeans 
start!!!"); + // run the first step in KMeans + randomSelectCenters(numberOfCenters, numberOfInput, input, centers); + + // run the second step in KMeans + groupPointsByCenters(numberOfCenters, numberOfInput, input, centers, + oldGroups); + } + + do { + // run the third step in KMeans + adjustClusteringCenters(centers, oldGroups); + + // run the second step in KMeans + groupPointsByCenters(numberOfCenters, numberOfInput, input, centers, + newGroups); + + // check whether to stop + checkStopStatus(flagToStop, oldGroups, newGroups); + + // refresh the groups, store newGroups in oldGroups and clean newGroups + refreshGroup(oldGroups, newGroups); + + } while (!flagToStop); + if (kmeanspp) { + SESAME_INFO("KMeans++ sourceEnd!!!"); + } else { + SESAME_INFO("KMeans sourceEnd!!!"); + } +} + +void SESAME::KMeans::Run(SESAME::SesameParam ¶m, + vector &online_centers, + SESAME::DataSinkPtr sinkPtr) { + srand(param.seed); + if (online_centers.size() <= param.k or param.k < 2) { + int i = 0; + for (auto el : online_centers) { + el->setClusteringCenter(i++); + sinkPtr->put(el); } - else - { - SESAME_INFO("KMeans start!!!"); - // run the first step in KMeans - randomSelectCenters(numberOfCenters, numberOfInput, input, centers); - - // run the second step in KMeans - groupPointsByCenters(numberOfCenters, numberOfInput, input, centers, oldGroups); + } else { + bool flagToStop = false; + int numberOfCenters = param.k; + int numberOfInput = (int)online_centers.size(); + std::vector offlineCenters; + std::vector> oldGroups, newGroups; + + if (param.kmeanspp) { + // run the first step in KMeans++ + SESAME_INFO("KMeans++ start!!!"); + randomSelectCenters(1, numberOfInput, online_centers, offlineCenters); + int resetCenter = numberOfCenters - 1; + selectCentersFromWeight(resetCenter, numberOfInput, online_centers, + offlineCenters); + + // run the second step in KMeans++ + groupPointsByCenters(numberOfCenters, numberOfInput, online_centers, + offlineCenters, oldGroups); + } else { + 
SESAME_INFO("KMeans start!!!"); + // run the first step in KMeans + randomSelectCenters(numberOfCenters, numberOfInput, online_centers, + offlineCenters); + + // run the second step in KMeans + groupPointsByCenters(numberOfCenters, numberOfInput, online_centers, + offlineCenters, oldGroups); } - do - { - // run the third step in KMeans - adjustClusteringCenters(centers, oldGroups); + do { + // run the third step in KMeans + adjustClusteringCenters(offlineCenters, oldGroups); - // run the second step in KMeans - groupPointsByCenters(numberOfCenters, numberOfInput, input, centers, newGroups); + // run the second step in KMeans + groupPointsByCenters(numberOfCenters, numberOfInput, online_centers, + offlineCenters, newGroups); - // check whether to stop - checkStopStatus(flagToStop, oldGroups, newGroups); + // check whether to stop + checkStopStatus(flagToStop, oldGroups, newGroups); - // refresh the groups, store newGroups in oldGroups and clean newGroups - refreshGroup(oldGroups, newGroups); + // refresh the groups, store newGroups in oldGroups and clean newGroups + refreshGroup(oldGroups, newGroups); } while (!flagToStop); - if (kmeanspp) - { - SESAME_INFO("KMeans++ sourceEnd!!!"); - } - else - { - SESAME_INFO("KMeans sourceEnd!!!"); + if (param.kmeanspp) { + SESAME_INFO("KMeans++ sourceEnd!!!"); + } else { + SESAME_INFO("KMeans sourceEnd!!!"); } + produceResult(oldGroups, sinkPtr); + } } -void SESAME::KMeans::Run(SESAME::SesameParam ¶m, vector &online_centers, - SESAME::DataSinkPtr sinkPtr) -{ - srand(param.seed); - if (online_centers.size() <= param.k or param.k < 2) - { - int i = 0; - for (auto el : online_centers) - { - el->setClusteringCenter(i++); - sinkPtr->put(el); - } - } - else - { - bool flagToStop = false; - int numberOfCenters = param.k; - int numberOfInput = (int)online_centers.size(); - std::vector offlineCenters; - std::vector> oldGroups, newGroups; - - if (param.kmeanspp) - { - // run the first step in KMeans++ - SESAME_INFO("KMeans++ start!!!"); - 
randomSelectCenters(1, numberOfInput, online_centers, offlineCenters); - int resetCenter = numberOfCenters - 1; - selectCentersFromWeight(resetCenter, numberOfInput, online_centers, offlineCenters); - - // run the second step in KMeans++ - groupPointsByCenters(numberOfCenters, numberOfInput, online_centers, offlineCenters, - oldGroups); - } - else - { - SESAME_INFO("KMeans start!!!"); - // run the first step in KMeans - randomSelectCenters(numberOfCenters, numberOfInput, online_centers, offlineCenters); - - // run the second step in KMeans - groupPointsByCenters(numberOfCenters, numberOfInput, online_centers, offlineCenters, - oldGroups); - } - - do - { - // run the third step in KMeans - adjustClusteringCenters(offlineCenters, oldGroups); - - // run the second step in KMeans - groupPointsByCenters(numberOfCenters, numberOfInput, online_centers, offlineCenters, - newGroups); - - // check whether to stop - checkStopStatus(flagToStop, oldGroups, newGroups); - - // refresh the groups, store newGroups in oldGroups and clean newGroups - refreshGroup(oldGroups, newGroups); - - } while (!flagToStop); - if (param.kmeanspp) - { - SESAME_INFO("KMeans++ sourceEnd!!!"); - } - else - { - SESAME_INFO("KMeans sourceEnd!!!"); - } - produceResult(oldGroups, sinkPtr); +void SESAME::KMeans::Run(SESAME::SesameParam ¶m, + vector &online_centers, + vector &results) { + srand(param.seed); + if (online_centers.size() <= param.k or param.k < 2) { + int i = 0; + for (auto el : online_centers) { + el->setClusteringCenter(i++); + results.push_back(el); } -} - -void SESAME::KMeans::Run(SESAME::SesameParam ¶m, vector &online_centers, - vector &results) -{ - srand(param.seed); - if (online_centers.size() <= param.k or param.k < 2) - { - int i = 0; - for (auto el : online_centers) - { - el->setClusteringCenter(i++); - results.push_back(el); - } + } else { + bool flagToStop = false; + int numberOfCenters = param.k; + int numberOfInput = (int)online_centers.size(); + std::vector offlineCenters; + 
std::vector> oldGroups, newGroups; + + if (param.kmeanspp) { + // run the first step in KMeans++ + SESAME_INFO("KMeans++ start!!!"); + randomSelectCenters(1, numberOfInput, online_centers, offlineCenters); + int resetCenter = numberOfCenters - 1; + selectCentersFromWeight(resetCenter, numberOfInput, online_centers, + offlineCenters); + + // run the second step in KMeans++ + groupPointsByCenters(numberOfCenters, numberOfInput, online_centers, + offlineCenters, oldGroups); + } else { + SESAME_INFO("KMeans start!!!"); + // run the first step in KMeans + randomSelectCenters(numberOfCenters, numberOfInput, online_centers, + offlineCenters); + + // run the second step in KMeans + groupPointsByCenters(numberOfCenters, numberOfInput, online_centers, + offlineCenters, oldGroups); } - else - { - bool flagToStop = false; - int numberOfCenters = param.k; - int numberOfInput = (int)online_centers.size(); - std::vector offlineCenters; - std::vector> oldGroups, newGroups; - - if (param.kmeanspp) - { - // run the first step in KMeans++ - SESAME_INFO("KMeans++ start!!!"); - randomSelectCenters(1, numberOfInput, online_centers, offlineCenters); - int resetCenter = numberOfCenters - 1; - selectCentersFromWeight(resetCenter, numberOfInput, online_centers, offlineCenters); - - // run the second step in KMeans++ - groupPointsByCenters(numberOfCenters, numberOfInput, online_centers, offlineCenters, - oldGroups); - } - else - { - SESAME_INFO("KMeans start!!!"); - // run the first step in KMeans - randomSelectCenters(numberOfCenters, numberOfInput, online_centers, offlineCenters); - - // run the second step in KMeans - groupPointsByCenters(numberOfCenters, numberOfInput, online_centers, offlineCenters, - oldGroups); - } - do - { - // run the third step in KMeans - adjustClusteringCenters(offlineCenters, oldGroups); + do { + // run the third step in KMeans + adjustClusteringCenters(offlineCenters, oldGroups); - // run the second step in KMeans - groupPointsByCenters(numberOfCenters, 
numberOfInput, online_centers, offlineCenters, - newGroups); + // run the second step in KMeans + groupPointsByCenters(numberOfCenters, numberOfInput, online_centers, + offlineCenters, newGroups); - // check whether to stop - checkStopStatus(flagToStop, oldGroups, newGroups); + // check whether to stop + checkStopStatus(flagToStop, oldGroups, newGroups); - // refresh the groups, store newGroups in oldGroups and clean newGroups - refreshGroup(oldGroups, newGroups); + // refresh the groups, store newGroups in oldGroups and clean newGroups + refreshGroup(oldGroups, newGroups); - } while (!flagToStop); - if (param.kmeanspp) - { - SESAME_INFO("KMeans++ sourceEnd!!!"); - } - else - { - SESAME_INFO("KMeans sourceEnd!!!"); - } - for (int i = 0; i < oldGroups.size(); i++) - { - for (int j = 0; j < oldGroups[i].size(); j++) - { - oldGroups[i][j]->setClusteringCenter(i); - results.push_back(oldGroups[i][j]->copy()); // point index start from 0 - } - } + } while (!flagToStop); + if (param.kmeanspp) { + SESAME_INFO("KMeans++ sourceEnd!!!"); + } else { + SESAME_INFO("KMeans sourceEnd!!!"); + } + for (int i = 0; i < oldGroups.size(); i++) { + for (int j = 0; j < oldGroups[i].size(); j++) { + oldGroups[i][j]->setClusteringCenter(i); + results.push_back(oldGroups[i][j]->copy()); // point index start from 0 + } } + } } diff --git a/src/Algorithm/OfflineRefinement/OfflineRefinement.cpp b/src/Algorithm/OfflineRefinement/OfflineRefinement.cpp index 5b19d00a..87cb0d99 100644 --- a/src/Algorithm/OfflineRefinement/OfflineRefinement.cpp +++ b/src/Algorithm/OfflineRefinement/OfflineRefinement.cpp @@ -1,4 +1,5 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) // // Created by Shuhao Zhang on 19/07/2021. 
diff --git a/src/Algorithm/SlidingWindowClustering.cpp b/src/Algorithm/SlidingWindowClustering.cpp index 638a6513..77c17504 100644 --- a/src/Algorithm/SlidingWindowClustering.cpp +++ b/src/Algorithm/SlidingWindowClustering.cpp @@ -3,176 +3,158 @@ #include -namespace SESAME -{ -SlidingWindowClustering::SlidingWindowClustering(param_t &cmd_params) : r(cmd_params.seed) -{ - this->param = cmd_params; +namespace SESAME { +SlidingWindowClustering::SlidingWindowClustering(param_t &cmd_params) + : r(cmd_params.seed) { + this->param = cmd_params; } SlidingWindowClustering::~SlidingWindowClustering() {} void SlidingWindowClustering::Init() { sum_timer.Tick(); } -void k_means_plus_plus(Random *r, const std::vector> &instance, - int32_t k, std::vector *centers, double *cost) -{ - centers->clear(); +void k_means_plus_plus(Random *r, + const std::vector> &instance, + int32_t k, std::vector *centers, double *cost) { + centers->clear(); - std::vector min_distance_to_centers(instance.size(), - std::numeric_limits::max()); + std::vector min_distance_to_centers( + instance.size(), std::numeric_limits::max()); - if (k >= instance.size()) - { - for (int32_t i = 0; i < instance.size(); i++) - { - centers->push_back(i); - } + if (k >= instance.size()) { + for (int32_t i = 0; i < instance.size(); i++) { + centers->push_back(i); } - else - { - // add u.a.r. center. 
- auto index = r->random_uniform(0, (int)instance.size() - 1); - centers->push_back(index); - while (centers->size() < k) - { - double sum_pow_min_distances = 0.0; - vector min_dist_powers; - min_dist_powers.reserve(instance.size()); - - for (int pos = 0; pos < instance.size(); ++pos) - { - double min_distance = - std::min(min_distance_to_centers.at(pos), - instance[pos].first->L2Dist(instance.at(centers->back()).first)); - min_distance_to_centers[pos] = min_distance; - sum_pow_min_distances += std::pow(min_distance, 2) * instance.at(pos).second; - min_dist_powers.push_back(std::pow(min_distance, 2) * instance.at(pos).second); - } - - double random_place = r->random_uniform(0.0, sum_pow_min_distances); - for (int32_t i = 0; i < instance.size(); i++) - { - if (random_place <= min_dist_powers[i]) - { - centers->push_back(i); - break; - } - random_place -= min_dist_powers[i]; - } + } else { + // add u.a.r. center. + auto index = r->random_uniform(0, (int)instance.size() - 1); + centers->push_back(index); + while (centers->size() < k) { + double sum_pow_min_distances = 0.0; + vector min_dist_powers; + min_dist_powers.reserve(instance.size()); + + for (int pos = 0; pos < instance.size(); ++pos) { + double min_distance = std::min( + min_distance_to_centers.at(pos), + instance[pos].first->L2Dist(instance.at(centers->back()).first)); + min_distance_to_centers[pos] = min_distance; + sum_pow_min_distances += + std::pow(min_distance, 2) * instance.at(pos).second; + min_dist_powers.push_back(std::pow(min_distance, 2) * + instance.at(pos).second); + } + + double random_place = r->random_uniform(0.0, sum_pow_min_distances); + for (int32_t i = 0; i < instance.size(); i++) { + if (random_place <= min_dist_powers[i]) { + centers->push_back(i); + break; } + random_place -= min_dist_powers[i]; + } } - - *cost = 0; - for (int pos = 0; pos < instance.size(); ++pos) - { - double min_distance = - std::min(min_distance_to_centers.at(pos), - 
instance[pos].first->L2Dist(instance[centers->back()].first)); - *cost += std::pow(min_distance, 2) * instance.at(pos).second; - } + } + + *cost = 0; + for (int pos = 0; pos < instance.size(); ++pos) { + double min_distance = + std::min(min_distance_to_centers.at(pos), + instance[pos].first->L2Dist(instance[centers->back()].first)); + *cost += std::pow(min_distance, 2) * instance.at(pos).second; + } } // Given a series of instances of the problem, runs the k-means++ algorithm on // each instance and outputs the vector of costs of the solutions found. Any // other algorithm could be used instead of k-means++. -vector cost_samples(Random *r, const vector &samples, int window_size, - int num_samples, int k) -{ - vector costs; - costs.reserve(num_samples); - for (int i = 0; i < num_samples; i++) - { - std::vector> instance; - for (int j = 0; j < window_size; j++) - { - instance.push_back(std::make_pair(samples[i * window_size + j], 1.0)); - } - std::vector ingnored_centers; - double cost; - k_means_plus_plus(r, instance, k, &ingnored_centers, &cost); - if (cost > 0) - { - costs.push_back(cost); - } +vector cost_samples(Random *r, const vector &samples, + int window_size, int num_samples, int k) { + vector costs; + costs.reserve(num_samples); + for (int i = 0; i < num_samples; i++) { + std::vector> instance; + for (int j = 0; j < window_size; j++) { + instance.push_back(std::make_pair(samples[i * window_size + j], 1.0)); + } + std::vector ingnored_centers; + double cost; + k_means_plus_plus(r, instance, k, &ingnored_centers, &cost); + if (cost > 0) { + costs.push_back(cost); } - return costs; + } + return costs; } // Given a vector of costs, guesses a range for the optimum value. This is done // by using a heuristic based on the min/max value and the standard deviation. // Outputs the min and max bounds as a pair. 
-std::pair guess_bounds(const vector &costs) -{ - auto min_max_it = std::minmax_element(costs.begin(), costs.end()); - double mean = std::accumulate(std::begin(costs), std::end(costs), double{0}) / costs.size(); - std::vector costs_sq; - costs_sq.resize(costs.size()); - for (const auto &cost : costs) - { - costs_sq.push_back(cost * cost); - } - double mean_sq = - std::accumulate(std::begin(costs_sq), std::end(costs_sq), double{0}) / costs.size(); - double stddev = std::sqrt(mean_sq - mean * mean); - - double lowerbound = std::max(*min_max_it.first / 3, mean - 3.0 * stddev); - double upperbound = std::max(mean + 3.0 * stddev, *min_max_it.second * 3); - - return std::make_pair(lowerbound, upperbound); +std::pair guess_bounds(const vector &costs) { + auto min_max_it = std::minmax_element(costs.begin(), costs.end()); + double mean = std::accumulate(std::begin(costs), std::end(costs), double{0}) / + costs.size(); + std::vector costs_sq; + costs_sq.resize(costs.size()); + for (const auto &cost : costs) { + costs_sq.push_back(cost * cost); + } + double mean_sq = + std::accumulate(std::begin(costs_sq), std::end(costs_sq), double{0}) / + costs.size(); + double stddev = std::sqrt(mean_sq - mean * mean); + + double lowerbound = std::max(*min_max_it.first / 3, mean - 3.0 * stddev); + double upperbound = std::max(mean + 3.0 * stddev, *min_max_it.second * 3); + + return std::make_pair(lowerbound, upperbound); } -std::pair guess_optimum_range_bounds(Random *r, const vector &samples, - int window_size, int num_samples, int k) -{ - auto costs = cost_samples(r, samples, window_size, num_samples, k); - return guess_bounds(costs); +std::pair +guess_optimum_range_bounds(Random *r, const vector &samples, + int window_size, int num_samples, int k) { + auto costs = cost_samples(r, samples, window_size, num_samples, k); + return guess_bounds(costs); } -void SlidingWindowClustering::RunOnline(PointPtr input) -{ - ++count; - if (!has_sampled) - { - win_timer.Tick(); - if (samples.size() < 
param.num_samples * param.sliding) samples.push_back(input); - if (samples.size() >= param.num_samples * param.sliding) - { - const auto &[lower_bound, upper_bound] = guess_optimum_range_bounds( - &r, samples, param.sliding, param.num_samples, param.num_clusters); - framework = GenericFactory::New>( - &r, param.sliding, param.num_clusters, param.delta_grid, lower_bound, upper_bound); - for (auto p : samples) - { - framework->process_point(p); - } - has_sampled = true; - } - win_timer.Tock(); - } - else - { - ds_timer.Tick(); - framework->process_point(input); - ds_timer.Tock(); +void SlidingWindowClustering::RunOnline(PointPtr input) { + ++count; + if (!has_sampled) { + win_timer.Tick(); + if (samples.size() < param.num_samples * param.sliding) + samples.push_back(input); + if (samples.size() >= param.num_samples * param.sliding) { + const auto &[lower_bound, upper_bound] = guess_optimum_range_bounds( + &r, samples, param.sliding, param.num_samples, param.num_clusters); + framework = GenericFactory::New>( + &r, param.sliding, param.num_clusters, param.delta_grid, lower_bound, + upper_bound); + for (auto p : samples) { + framework->process_point(p); + } + has_sampled = true; } - lat_timer.Add(input->toa); + win_timer.Tock(); + } else { + ds_timer.Tick(); + framework->process_point(input); + ds_timer.Tock(); + } + lat_timer.Add(input->toa); } -void SlidingWindowClustering::RunOffline(DataSinkPtr sinkPtr) -{ - on_timer.Add(sum_timer.start); - ref_timer.Tick(); - std::vector online_centers; - double cost_estimate = 0; - framework->solution(&online_centers, &cost_estimate); - for (int i = 0; i < online_centers.size(); i++) - { - online_centers[i]->clu_id = i; - sinkPtr->put(online_centers[i]); - } - ref_timer.Tock(); - sum_timer.Tock(); +void SlidingWindowClustering::RunOffline(DataSinkPtr sinkPtr) { + on_timer.Add(sum_timer.start); + ref_timer.Tick(); + std::vector online_centers; + double cost_estimate = 0; + framework->solution(&online_centers, &cost_estimate); + for 
(int i = 0; i < online_centers.size(); i++) { + online_centers[i]->clu_id = i; + sinkPtr->put(online_centers[i]); + } + ref_timer.Tock(); + sum_timer.Tock(); } -} // namespace SESAME \ No newline at end of file +} // namespace SESAME \ No newline at end of file diff --git a/src/Algorithm/StreamKM.cpp b/src/Algorithm/StreamKM.cpp index d4259c53..591e5cb6 100644 --- a/src/Algorithm/StreamKM.cpp +++ b/src/Algorithm/StreamKM.cpp @@ -1,4 +1,5 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) // // Created by tuidan on 2021/7/21. @@ -11,671 +12,559 @@ #include #include -void SESAME::StreamKM::Init() -{ - // initial the landmark window - UtilityFunctions::init_genrand(this->StreamKMParam.seed); - this->window = WindowFactory::createLandmarkWindow(); - this->window->windowManager.numberOfWindow = - ceil(log((double)this->StreamKMParam.num_points / (double)this->StreamKMParam.windowSize) / - log(2)) + - 2; - this->window->windowManager.maxWindowSize = this->StreamKMParam.windowSize; - this->window->initWindow(this->StreamKMParam.windowSize); - this->window->tree = GenericFactory::New(param); - SESAME_DEBUG("Created manager with " << this->window->windowManager.numberOfWindow - << " windows of dim: " << this->StreamKMParam.dim); - sum_timer.Tick(); +void SESAME::StreamKM::Init() { + // initial the landmark window + UtilityFunctions::init_genrand(this->StreamKMParam.seed); + this->window = WindowFactory::createLandmarkWindow(); + this->window->windowManager.numberOfWindow = + ceil(log((double)this->StreamKMParam.num_points / + (double)this->StreamKMParam.windowSize) / + log(2)) + + 2; + this->window->windowManager.maxWindowSize = this->StreamKMParam.windowSize; + this->window->initWindow(this->StreamKMParam.windowSize); + this->window->tree = GenericFactory::New(param); + SESAME_DEBUG("Created manager with " + << 
this->window->windowManager.numberOfWindow + << " windows of dim: " << this->StreamKMParam.dim); + sum_timer.Tick(); } /** - * @Description: build the landmark Window, insert the data point and construct the coreset tree if - * the window is full + * @Description: build the landmark Window, insert the data point and construct + * the coreset tree if the window is full * @Param: - * @Return: although void, but actually we store the output result(with computed clustering center) - * into this->streamingCoreset + * @Return: although void, but actually we store the output result(with computed + * clustering center) into this->streamingCoreset */ -void SESAME::StreamKM::RunOnline(const SESAME::PointPtr input) -{ - ds_timer.Tick(); - this->window->insertPoint(input); - ds_timer.Tock(); - lat_timer.Add(input->toa); +void SESAME::StreamKM::RunOnline(const SESAME::PointPtr input) { + ds_timer.Tick(); + this->window->insertPoint(input); + ds_timer.Tock(); + lat_timer.Add(input->toa); } /** - * @Description: we run offline KMeans++ algorithm 5 times using the final m coreset - * points(this->streamingCoreset)[m>k] + * @Description: we run offline KMeans++ algorithm 5 times using the final m + * coreset points(this->streamingCoreset)[m>k] * @param num_clusters * @param coreset_size * @param dim * @param output */ -void SESAME::StreamKM::RunOffline(DataSinkPtr sinkPtr) -{ - on_timer.Add(sum_timer.start); - ref_timer.Tick(); - this->window->getCoresetFromManager( - this->streamingCoreset); // streamingCoreset = - // LandmarkWindow::getCoresetFromManager(manager); - int parNumber = this->streamingCoreset.size(); - for (int i = 0; i < parNumber; i++) - { - for (int j = 0; j < this->StreamKMParam.dim; j++) - { - this->streamingCoreset[i]->setFeatureItem(this->streamingCoreset[i]->getFeatureItem(j) / - this->streamingCoreset[i]->getWeight(), - j); - } - } - if (param.run_offline) - { - vector centers; - vector> groups; - std::vector> oldGroups, newGroups; - 
this->km.runKMeans(this->streamingCoreset.size(), parNumber, centers, - this->streamingCoreset, oldGroups, newGroups, this->StreamKMParam.seed, - true); - // store the result input output - this->km.produceResult(oldGroups, sinkPtr); +void SESAME::StreamKM::RunOffline(DataSinkPtr sinkPtr) { + on_timer.Add(sum_timer.start); + ref_timer.Tick(); + this->window->getCoresetFromManager( + this->streamingCoreset); // streamingCoreset = + // LandmarkWindow::getCoresetFromManager(manager); + int parNumber = this->streamingCoreset.size(); + for (int i = 0; i < parNumber; i++) { + for (int j = 0; j < this->StreamKMParam.dim; j++) { + this->streamingCoreset[i]->setFeatureItem( + this->streamingCoreset[i]->getFeatureItem(j) / + this->streamingCoreset[i]->getWeight(), + j); } - else - { - for (int i = 0; i < parNumber; i++) - { - streamingCoreset[i]->setClusteringCenter(i); - sinkPtr->put(streamingCoreset[i]); - } + } + if (param.run_offline) { + vector centers; + vector> groups; + std::vector> oldGroups, newGroups; + this->km.runKMeans(this->streamingCoreset.size(), parNumber, centers, + this->streamingCoreset, oldGroups, newGroups, + this->StreamKMParam.seed, true); + // store the result input output + this->km.produceResult(oldGroups, sinkPtr); + } else { + for (int i = 0; i < parNumber; i++) { + streamingCoreset[i]->setClusteringCenter(i); + sinkPtr->put(streamingCoreset[i]); } - ref_timer.Tock(); - sum_timer.Tock(); + } + ref_timer.Tock(); + sum_timer.Tock(); } -SESAME::StreamKM::StreamKM(param_t &cmd_params) -{ - this->param = cmd_params; - this->StreamKMParam.num_points = cmd_params.num_points; - this->StreamKMParam.num_clusters = cmd_params.k; - this->StreamKMParam.windowSize = cmd_params.coreset_size; - this->StreamKMParam.seed = cmd_params.seed; - this->StreamKMParam.dim = cmd_params.dim; +SESAME::StreamKM::StreamKM(param_t &cmd_params) { + this->param = cmd_params; + this->StreamKMParam.num_points = cmd_params.num_points; + this->StreamKMParam.num_clusters = 
cmd_params.k; + this->StreamKMParam.windowSize = cmd_params.coreset_size; + this->StreamKMParam.seed = cmd_params.seed; + this->StreamKMParam.dim = cmd_params.dim; } SESAME::StreamKM::~StreamKM() {} // TODO: convert it as general tree model -void SESAME::LandmarkWindow::CoresetTree::unionTreeCoreset(int k, int n_1, int n_2, - std::vector &setA, - std::vector &setB, - std::vector ¢res) -{ - // SESAME_DEBUG("Computing coreset..."); - // total number of points - int n = n_1 + n_2; - // choose the first centre (each point has the same probability of being choosen) - // stores, how many centres have been choosen yet - int choosenPoints = 0; - // only choose from the n-i points not already choosen - int j = UtilityFunctions::genrand_int31() % (n - choosenPoints); - - // copy the choosen point - if (j < n_1) - { - centres[choosenPoints] = setA[j]->copy(); // TODO: ???? why re-set setA[j]? +void SESAME::LandmarkWindow::CoresetTree::unionTreeCoreset( + int k, int n_1, int n_2, std::vector &setA, + std::vector &setB, std::vector ¢res) { + // SESAME_DEBUG("Computing coreset..."); + // total number of points + int n = n_1 + n_2; + // choose the first centre (each point has the same probability of being + // choosen) stores, how many centres have been choosen yet + int choosenPoints = 0; + // only choose from the n-i points not already choosen + int j = UtilityFunctions::genrand_int31() % (n - choosenPoints); + + // copy the choosen point + if (j < n_1) { + centres[choosenPoints] = setA[j]->copy(); // TODO: ???? why re-set setA[j]? + } else { + j = j - n_1; + centres[choosenPoints] = setB[j]->copy(); // TODO: ???? why re-set setB[j]? 
+ } + // struct treeNode *root = (struct treeNode *) malloc(sizeof(struct + // treeNode)); + TreeNodePtr root = DataStructureFactory::createTreeNode(); + constructRoot(root, setA, setB, n_1, n_2, centres[choosenPoints], + choosenPoints); + choosenPoints = 1; + + // choose the remaining points + while (choosenPoints < k) { + if (root->cost > 0.0) { + TreeNodePtr leaf = selectNode(root); + PointPtr centre = chooseCentre(leaf); + split(leaf, centre, choosenPoints); + centres[choosenPoints] = centre->copy(); + } else { + // create a dummy point + centres[choosenPoints] = root->centre->copy(); + int l; + for (l = 0; l < centres[choosenPoints]->getDimension(); l++) { + centres[choosenPoints]->setFeatureItem(-1 * 1000000, l); + } + centres[choosenPoints]->setIndex(-1); + centres[choosenPoints]->setWeight(0.0); } - else - { - j = j - n_1; - centres[choosenPoints] = setB[j]->copy(); // TODO: ???? why re-set setB[j]? - } - // struct treeNode *root = (struct treeNode *) malloc(sizeof(struct treeNode)); - TreeNodePtr root = DataStructureFactory::createTreeNode(); - constructRoot(root, setA, setB, n_1, n_2, centres[choosenPoints], choosenPoints); - choosenPoints = 1; - - // choose the remaining points - while (choosenPoints < k) - { - if (root->cost > 0.0) - { - TreeNodePtr leaf = selectNode(root); - PointPtr centre = chooseCentre(leaf); - split(leaf, centre, choosenPoints); - centres[choosenPoints] = centre->copy(); - } - else - { - // create a dummy point - centres[choosenPoints] = root->centre->copy(); - int l; - for (l = 0; l < centres[choosenPoints]->getDimension(); l++) - { - centres[choosenPoints]->setFeatureItem(-1 * 1000000, l); - } - centres[choosenPoints]->setIndex(-1); - centres[choosenPoints]->setWeight(0.0); - } - choosenPoints++; - } + choosenPoints++; + } - // free the tree - freeTree(root); - - // recalculate clustering features - int i; - for (i = 0; i < n; i++) - { - if (i < n_1) - { - int index = setA[i]->getClusteringCenter(); - if 
(centres[index]->getIndex() != setA[i]->getIndex()) - { - centres[index]->setWeight(centres[index]->getWeight() + setA[i]->getWeight()); - int l; - for (l = 0; l < setA[i]->getDimension(); l++) - { - if (setA[i]->getWeight() != 0.0) - { - centres[index]->setFeatureItem( - setA[i]->getFeatureItem(l) + centres[index]->getFeatureItem(l), l); - } - } - } - } - else - { - int index = setB[i - n_1]->getClusteringCenter(); - if (centres[index]->getIndex() != setB[i - n_1]->getIndex()) - { - centres[index]->setWeight(centres[index]->getWeight() + setB[i - n_1]->getWeight()); - int l; - for (l = 0; l < setB[i - n_1]->getDimension(); l++) - { - if (setB[i - n_1]->getWeight() != 0.0) - { - centres[index]->setFeatureItem( - setB[i - n_1]->getFeatureItem(l) + centres[index]->getFeatureItem(l), - l); - } - } - } - } + // free the tree + freeTree(root); + + // recalculate clustering features + int i; + for (i = 0; i < n; i++) { + if (i < n_1) { + int index = setA[i]->getClusteringCenter(); + if (centres[index]->getIndex() != setA[i]->getIndex()) { + centres[index]->setWeight(centres[index]->getWeight() + + setA[i]->getWeight()); + int l; + for (l = 0; l < setA[i]->getDimension(); l++) { + if (setA[i]->getWeight() != 0.0) { + centres[index]->setFeatureItem( + setA[i]->getFeatureItem(l) + centres[index]->getFeatureItem(l), + l); + } + } + } + } else { + int index = setB[i - n_1]->getClusteringCenter(); + if (centres[index]->getIndex() != setB[i - n_1]->getIndex()) { + centres[index]->setWeight(centres[index]->getWeight() + + setB[i - n_1]->getWeight()); + int l; + for (l = 0; l < setB[i - n_1]->getDimension(); l++) { + if (setB[i - n_1]->getWeight() != 0.0) { + centres[index]->setFeatureItem( + setB[i - n_1]->getFeatureItem(l) + + centres[index]->getFeatureItem(l), + l); + } + } + } } + } } -void SESAME::LandmarkWindow::CoresetTree::freeTree(TreeNodePtr root) -{ - while (!treeFinished(root)) - { - if (root->lc == NULL && root->rc == NULL) - { - root = root->parent; - } - else if 
(root->lc == NULL && root->rc != NULL) - { - // Schau ob rc ein Blatt ist - if (isLeaf(root->rc)) - { - // Gebe rechtes Kind frei - root->rc->points.clear(); - DataStructureFactory::clearTreeNode(root->rc); - root->rc = NULL; - } - else - { - // Fahre mit rechtem Kind fort - root = root->rc; - } - } - else if (root->lc != NULL) - { - if (isLeaf(root->lc)) - { - root->lc->points.clear(); - DataStructureFactory::clearTreeNode(root->lc); - root->lc = NULL; - } - else - { - root = root->lc; - } - } +void SESAME::LandmarkWindow::CoresetTree::freeTree(TreeNodePtr root) { + while (!treeFinished(root)) { + if (root->lc == NULL && root->rc == NULL) { + root = root->parent; + } else if (root->lc == NULL && root->rc != NULL) { + // Schau ob rc ein Blatt ist + if (isLeaf(root->rc)) { + // Gebe rechtes Kind frei + root->rc->points.clear(); + DataStructureFactory::clearTreeNode(root->rc); + root->rc = NULL; + } else { + // Fahre mit rechtem Kind fort + root = root->rc; + } + } else if (root->lc != NULL) { + if (isLeaf(root->lc)) { + root->lc->points.clear(); + DataStructureFactory::clearTreeNode(root->lc); + root->lc = NULL; + } else { + root = root->lc; + } } - root->points.clear(); - root.reset(); + } + root->points.clear(); + root.reset(); } -bool SESAME::LandmarkWindow::CoresetTree::treeFinished(TreeNodePtr root) -{ - if (root->parent == NULL && root->lc == NULL && root->rc == NULL) - { - return 1; - } - else - { - return 0; - } +bool SESAME::LandmarkWindow::CoresetTree::treeFinished(TreeNodePtr root) { + if (root->parent == NULL && root->lc == NULL && root->rc == NULL) { + return 1; + } else { + return 0; + } } -bool SESAME::LandmarkWindow::CoresetTree::isLeaf(TreeNodePtr node) -{ - if (node->lc == NULL && node->rc == NULL) - { - return 1; - } - else - { - return 0; - } +bool SESAME::LandmarkWindow::CoresetTree::isLeaf(TreeNodePtr node) { + if (node->lc == NULL && node->rc == NULL) { + return 1; + } else { + return 0; + } } -void 
SESAME::LandmarkWindow::CoresetTree::constructRoot(TreeNodePtr root, - std::vector &setA, - std::vector &setB, int n_1, - int n_2, PointPtr centre, int centreIndex) -{ - // loop counter variable - int i; - // the root has no parent and no child nodes in the beginning - root->parent = NULL; - root->lc = NULL; - root->rc = NULL; - - // array with points to the points - // root->points= new Point[n_1 + n_2]; - root->n = n_1 + n_2; - - for (i = 0; i < root->n; i++) - { - if (i < n_1) - { - root->points.push_back(setA[i]); - // root->points[i] = setA[i]; - root->points[i]->setClusteringCenter(centreIndex); - } - else - { - root->points.push_back(setB[i - n_1]); - root->points[i]->setClusteringCenter(centreIndex); - } +void SESAME::LandmarkWindow::CoresetTree::constructRoot( + TreeNodePtr root, std::vector &setA, std::vector &setB, + int n_1, int n_2, PointPtr centre, int centreIndex) { + // loop counter variable + int i; + // the root has no parent and no child nodes in the beginning + root->parent = NULL; + root->lc = NULL; + root->rc = NULL; + + // array with points to the points + // root->points= new Point[n_1 + n_2]; + root->n = n_1 + n_2; + + for (i = 0; i < root->n; i++) { + if (i < n_1) { + root->points.push_back(setA[i]); + // root->points[i] = setA[i]; + root->points[i]->setClusteringCenter(centreIndex); + } else { + root->points.push_back(setB[i - n_1]); + root->points[i]->setClusteringCenter(centreIndex); } + } - // set the centre - root->centre = centre; + // set the centre + root->centre = centre; - // calculate costs - treeNodeTargetFunctionValue(root); + // calculate costs + treeNodeTargetFunctionValue(root); } -void SESAME::LandmarkWindow::CoresetTree::treeNodeTargetFunctionValue(TreeNodePtr node) -{ - // loop counter variable - int i; +void SESAME::LandmarkWindow::CoresetTree::treeNodeTargetFunctionValue( + TreeNodePtr node) { + // loop counter variable + int i; - // stores the cost - double sum = 0.0; + // stores the cost + double sum = 0.0; - for (i 
= 0; i < node->n; i++) - { - // stores the distance - double distance = 0.0; + for (i = 0; i < node->n; i++) { + // stores the distance + double distance = 0.0; - // loop counter variable - int l; + // loop counter variable + int l; - for (l = 0; l < node->points[i]->getDimension(); l++) - { - // centroid coordinate of the point - double centroidCoordinatePoint; - if (node->points[i]->getWeight() != 0.0) - { - centroidCoordinatePoint = - node->points[i]->getFeatureItem(l) / node->points[i]->getWeight(); - } - else - { - centroidCoordinatePoint = node->points[i]->getFeatureItem(l); - } - // centroid coordinate of the centre - double centroidCoordinateCentre; - if (node->centre->getWeight() != 0.0) - { - centroidCoordinateCentre = - node->centre->getFeatureItem(l) / node->centre->getWeight(); - } - else - { - centroidCoordinateCentre = node->centre->getFeatureItem(l); - } - distance += (centroidCoordinatePoint - centroidCoordinateCentre) * - (centroidCoordinatePoint - centroidCoordinateCentre); - } - sum += distance * node->points[i]->getWeight(); + for (l = 0; l < node->points[i]->getDimension(); l++) { + // centroid coordinate of the point + double centroidCoordinatePoint; + if (node->points[i]->getWeight() != 0.0) { + centroidCoordinatePoint = + node->points[i]->getFeatureItem(l) / node->points[i]->getWeight(); + } else { + centroidCoordinatePoint = node->points[i]->getFeatureItem(l); + } + // centroid coordinate of the centre + double centroidCoordinateCentre; + if (node->centre->getWeight() != 0.0) { + centroidCoordinateCentre = + node->centre->getFeatureItem(l) / node->centre->getWeight(); + } else { + centroidCoordinateCentre = node->centre->getFeatureItem(l); + } + distance += (centroidCoordinatePoint - centroidCoordinateCentre) * + (centroidCoordinatePoint - centroidCoordinateCentre); } - node->cost = sum; + sum += distance * node->points[i]->getWeight(); + } + node->cost = sum; } -SESAME::TreeNodePtr 
SESAME::LandmarkWindow::CoresetTree::selectNode(TreeNodePtr root) -{ - // random number between 0 and 1 - double random = UtilityFunctions::genrand_real3(); - while (!isLeaf(root)) - { - if (root->lc->cost == 0 && root->rc->cost == 0) - { - if (root->lc->n == 0) - { - root = root->rc; - } - else if (root->rc->n == 0) - { - root = root->lc; - } - else if (random < 0.5) - { - random = UtilityFunctions::genrand_real3(); - root = root->lc; - } - else - { - random = UtilityFunctions::genrand_real3(); - root = root->rc; - } - } - else - { - if (random < root->lc->cost / root->cost) - { - root = root->lc; - } - else - { - root = root->rc; - } - } +SESAME::TreeNodePtr +SESAME::LandmarkWindow::CoresetTree::selectNode(TreeNodePtr root) { + // random number between 0 and 1 + double random = UtilityFunctions::genrand_real3(); + while (!isLeaf(root)) { + if (root->lc->cost == 0 && root->rc->cost == 0) { + if (root->lc->n == 0) { + root = root->rc; + } else if (root->rc->n == 0) { + root = root->lc; + } else if (random < 0.5) { + random = UtilityFunctions::genrand_real3(); + root = root->lc; + } else { + random = UtilityFunctions::genrand_real3(); + root = root->rc; + } + } else { + if (random < root->lc->cost / root->cost) { + root = root->lc; + } else { + root = root->rc; + } } + } - return root; + return root; } /** * selects a new centre from the treenode (using the kMeans++ distribution) * TODO: Why hard-code?? */ -SESAME::PointPtr SESAME::LandmarkWindow::CoresetTree::chooseCentre(TreeNodePtr node) -{ - // TODO: How many times should we try to choose a centre ?? - int times = 3; +SESAME::PointPtr +SESAME::LandmarkWindow::CoresetTree::chooseCentre(TreeNodePtr node) { + // TODO: How many times should we try to choose a centre ?? 
+ int times = 3; - // stores the nodecost if node is split with the best centre - double minCost = node->cost; - PointPtr bestCentre = DataStructureFactory::createPoint(param.dim); + // stores the nodecost if node is split with the best centre + double minCost = node->cost; + PointPtr bestCentre = GenericFactory::New(param.dim); - // loop counter variable - int i; - int j; - - for (j = 0; j < times; j++) - { - // sum of the relativ cost of the points - double sum = 0.0; - // random number between 0 and 1 - double random = UtilityFunctions::genrand_real3(); - - for (i = 0; i < node->n; i++) - { - sum += treeNodeCostOfPoint(node, node->points[i]) / node->cost; - if (sum >= random) - { - if (node->points[i]->getWeight() == 0.0) - { - SESAME_INFO("ERROR: CHOOSEN DUMMY NODE THOUGH OTHER AVAILABLE \n"); - return bestCentre; - } - double curCost = treeNodeSplitCost(node, node->centre, node->points[i]); - if (curCost < minCost) - { - bestCentre = node->points[i]; - minCost = curCost; - } - break; - } + for (int j = 0; j < times; j++) { + // sum of the relativ cost of the points + double sum = 0.0; + // random number between 0 and 1 + double random = UtilityFunctions::genrand_real3(); + + for (int i = 0; i < node->n; i++) { + sum += treeNodeCostOfPoint(node, node->points[i]) / node->cost; + if (sum >= random) { + if (node->points[i]->getWeight() == 0.0) { + SESAME_INFO("ERROR: CHOOSEN DUMMY NODE THOUGH OTHER AVAILABLE \n"); + return bestCentre; } + double curCost = treeNodeSplitCost(node, node->centre, node->points[i]); + if (curCost < minCost) { + bestCentre = node->points[i]; + minCost = curCost; + } + break; + } } - if (bestCentre->getIndex() == -1) - { - return node->points[0]; - } - else - { - return bestCentre; - } + } + if (bestCentre->getIndex() == -1) { + return node->points[0]; + } else { + return bestCentre; + } } -double SESAME::LandmarkWindow::CoresetTree::treeNodeCostOfPoint(TreeNodePtr node, PointPtr p) -{ - if (p->getWeight() == 0.0) - { - return 0.0; 
+double +SESAME::LandmarkWindow::CoresetTree::treeNodeCostOfPoint(TreeNodePtr node, + PointPtr p) { + if (p->getWeight() == 0.0) { + return 0.0; + } + + // stores the distance between centre and p + double distance = 0.0; + + // loop counter variable + int l; + + for (l = 0; l < p->getDimension(); l++) { + // centroid coor->inate of the point + double centroidCoordinatePoint; + if (p->getWeight() != 0.0) { + centroidCoordinatePoint = p->getFeatureItem(l) / p->getWeight(); + } else { + centroidCoordinatePoint = p->getFeatureItem(l); } - - // stores the distance between centre and p - double distance = 0.0; - - // loop counter variable - int l; - - for (l = 0; l < p->getDimension(); l++) - { - // centroid coor->inate of the point - double centroidCoordinatePoint; - if (p->getWeight() != 0.0) - { - centroidCoordinatePoint = p->getFeatureItem(l) / p->getWeight(); - } - else - { - centroidCoordinatePoint = p->getFeatureItem(l); - } - // centroid coordinate of the centre - double centroidCoordinateCentre; - if (node->centre->getWeight() != 0.0) - { - centroidCoordinateCentre = node->centre->getFeatureItem(l) / node->centre->getWeight(); - } - else - { - centroidCoordinateCentre = node->centre->getFeatureItem(l); - } - distance += (centroidCoordinatePoint - centroidCoordinateCentre) * - (centroidCoordinatePoint - centroidCoordinateCentre); + // centroid coordinate of the centre + double centroidCoordinateCentre; + if (node->centre->getWeight() != 0.0) { + centroidCoordinateCentre = + node->centre->getFeatureItem(l) / node->centre->getWeight(); + } else { + centroidCoordinateCentre = node->centre->getFeatureItem(l); } - return distance * p->getWeight(); + distance += (centroidCoordinatePoint - centroidCoordinateCentre) * + (centroidCoordinatePoint - centroidCoordinateCentre); + } + return distance * p->getWeight(); } /** - * computes the hypothetical cost if the node would be split with new centers centreA, centreB + * computes the hypothetical cost if the node would be 
split with new centers + * centreA, centreB * @param node * @param centreA * @param centreB * @return */ -double SESAME::LandmarkWindow::CoresetTree::treeNodeSplitCost(TreeNodePtr node, PointPtr centreA, - PointPtr centreB) -{ +double SESAME::LandmarkWindow::CoresetTree::treeNodeSplitCost( + TreeNodePtr node, PointPtr centreA, PointPtr centreB) { + // loop counter variable + int i; + // stores the cost + double sum = 0.0; + for (i = 0; i < node->n; i++) { // loop counter variable - int i; - // stores the cost - double sum = 0.0; - for (i = 0; i < node->n; i++) - { - // loop counter variable - int l; - // stores the distance between p and centreA - double distanceA = 0.0; - for (l = 0; l < node->points[i]->getDimension(); l++) - { - // centroid coordinate of the point - double centroidCoordinatePoint; - if (node->points[i]->getWeight() != 0.0) - { - centroidCoordinatePoint = - node->points[i]->getFeatureItem(l) / node->points[i]->getWeight(); - } - else - { - centroidCoordinatePoint = node->points[i]->getFeatureItem(l); - } - // centroid coordinate of the centre - double centroidCoordinateCentre; - if (centreA->getWeight() != 0.0) - { - centroidCoordinateCentre = centreA->getFeatureItem(l) / centreA->getWeight(); - } - else - { - centroidCoordinateCentre = centreA->getFeatureItem(l); - } - distanceA += (centroidCoordinatePoint - centroidCoordinateCentre) * - (centroidCoordinatePoint - centroidCoordinateCentre); - } - // stores the distance between p and centreB - double distanceB = 0.0; - for (l = 0; l < node->points[i]->getDimension(); l++) - { - // centroid coordinate of the point - double centroidCoordinatePoint; - if (node->points[i]->getWeight() != 0.0) - { - centroidCoordinatePoint = - node->points[i]->getFeatureItem(l) / node->points[i]->getWeight(); - } - else - { - centroidCoordinatePoint = node->points[i]->getFeatureItem(l); - } - // centroid coordinate of the centre - double centroidCoordinateCentre; - if (centreB->getWeight() != 0.0) - { - 
centroidCoordinateCentre = centreB->getFeatureItem(l) / centreB->getWeight(); - } - else - { - centroidCoordinateCentre = centreB->getFeatureItem(l); - } - - distanceB += (centroidCoordinatePoint - centroidCoordinateCentre) * - (centroidCoordinatePoint - centroidCoordinateCentre); - } - // add the cost of the closest centre to the sum - if (distanceA < distanceB) - { - sum += distanceA * node->points[i]->getWeight(); - } - else - { - sum += distanceB * node->points[i]->getWeight(); - } + int l; + // stores the distance between p and centreA + double distanceA = 0.0; + for (l = 0; l < node->points[i]->getDimension(); l++) { + // centroid coordinate of the point + double centroidCoordinatePoint; + if (node->points[i]->getWeight() != 0.0) { + centroidCoordinatePoint = + node->points[i]->getFeatureItem(l) / node->points[i]->getWeight(); + } else { + centroidCoordinatePoint = node->points[i]->getFeatureItem(l); + } + // centroid coordinate of the centre + double centroidCoordinateCentre; + if (centreA->getWeight() != 0.0) { + centroidCoordinateCentre = + centreA->getFeatureItem(l) / centreA->getWeight(); + } else { + centroidCoordinateCentre = centreA->getFeatureItem(l); + } + distanceA += (centroidCoordinatePoint - centroidCoordinateCentre) * + (centroidCoordinatePoint - centroidCoordinateCentre); } - // return the total cost - return sum; + // stores the distance between p and centreB + double distanceB = 0.0; + for (l = 0; l < node->points[i]->getDimension(); l++) { + // centroid coordinate of the point + double centroidCoordinatePoint; + if (node->points[i]->getWeight() != 0.0) { + centroidCoordinatePoint = + node->points[i]->getFeatureItem(l) / node->points[i]->getWeight(); + } else { + centroidCoordinatePoint = node->points[i]->getFeatureItem(l); + } + // centroid coordinate of the centre + double centroidCoordinateCentre; + if (centreB->getWeight() != 0.0) { + centroidCoordinateCentre = + centreB->getFeatureItem(l) / centreB->getWeight(); + } else { + 
centroidCoordinateCentre = centreB->getFeatureItem(l); + } + + distanceB += (centroidCoordinatePoint - centroidCoordinateCentre) * + (centroidCoordinatePoint - centroidCoordinateCentre); + } + // add the cost of the closest centre to the sum + if (distanceA < distanceB) { + sum += distanceA * node->points[i]->getWeight(); + } else { + sum += distanceB * node->points[i]->getWeight(); + } + } + // return the total cost + return sum; } /** -splits the parent node and creates two child nodes (one with the old centre and one with the new -one) +splits the parent node and creates two child nodes (one with the old centre and +one with the new one) **/ -void SESAME::LandmarkWindow::CoresetTree::split(TreeNodePtr parent, PointPtr newCentre, - int newCentreIndex) -{ - // loop counter variable - int i = 0; - - // 1. Counts how many points belong to the new and how many points belong to the old centre - int nOld = 0; - int nNew = 0; - for (i = 0; i < parent->n; i++) - { - PointPtr centre = determineClosestCentre(parent->points[i], parent->centre, newCentre); - if (centre->getIndex() == newCentre->getIndex()) - { - nNew++; - } - else - { - nOld++; - } +void SESAME::LandmarkWindow::CoresetTree::split(TreeNodePtr parent, + PointPtr newCentre, + int newCentreIndex) { + // loop counter variable + int i = 0; + + // 1. Counts how many points belong to the new and how many points belong to + // the old centre + int nOld = 0; + int nNew = 0; + for (i = 0; i < parent->n; i++) { + PointPtr centre = + determineClosestCentre(parent->points[i], parent->centre, newCentre); + if (centre->getIndex() == newCentre->getIndex()) { + nNew++; + } else { + nOld++; } - - // 2. 
initalizes the arrays for the pointer - // array for pointer on the points belonging to the old centre - std::vector oldPoints; // = new Point[nOld]; - // for(i = 0; i < nOld; i++) { - // - // } - // array for pointer on the points belonging to the new centre - std::vector newPoints; //= new Point[nNew]; - - int indexOld = 0; - int indexNew = 0; - - for (i = 0; i < parent->n; i++) - { - PointPtr centre = determineClosestCentre(parent->points[i], parent->centre, newCentre); - if (centre->getIndex() == newCentre->getIndex()) - { - newPoints.push_back(parent->points[i]); - newPoints[indexNew]->setClusteringCenter(newCentreIndex); - indexNew++; - } - else if (centre->getIndex() == parent->centre->getIndex()) - { - oldPoints.push_back(parent->points[i]); - indexOld++; - } - else - { - SESAME_INFO("ERROR !!! NO CENTER NEAREST !! \n"); - } - } - - // left child: old centre - // struct TreeNode *lc = (struct TreeNode *) malloc(sizeof(struct TreeNode)); - TreeNodePtr lc = DataStructureFactory::createTreeNode(); - lc->centre = parent->centre; - lc->points = oldPoints; - lc->n = nOld; - - lc->lc = NULL; - lc->rc = NULL; - lc->parent = parent; - - treeNodeTargetFunctionValue(lc); - - // right child: new centre - // struct TreeNode *rc = (struct TreeNode *) malloc(sizeof(struct TreeNode)); - TreeNodePtr rc = DataStructureFactory::createTreeNode(); - rc->centre = newCentre; - rc->points = newPoints; - rc->n = nNew; - - rc->lc = NULL; - rc->rc = NULL; - rc->parent = parent; - - treeNodeTargetFunctionValue(rc); - - // set childs of the parent node - parent->lc = lc; - parent->rc = rc; - - // propagate the cost changes to the parent nodes - while (parent != NULL) - { - parent->cost = parent->lc->cost + parent->rc->cost; - parent = parent->parent; + } + + // 2. 
initalizes the arrays for the pointer + // array for pointer on the points belonging to the old centre + std::vector oldPoints; // = new Point[nOld]; + // for(i = 0; i < nOld; i++) { + // + // } + // array for pointer on the points belonging to the new centre + std::vector newPoints; //= new Point[nNew]; + + int indexOld = 0; + int indexNew = 0; + + for (i = 0; i < parent->n; i++) { + PointPtr centre = + determineClosestCentre(parent->points[i], parent->centre, newCentre); + if (centre->getIndex() == newCentre->getIndex()) { + newPoints.push_back(parent->points[i]); + newPoints[indexNew]->setClusteringCenter(newCentreIndex); + indexNew++; + } else if (centre->getIndex() == parent->centre->getIndex()) { + oldPoints.push_back(parent->points[i]); + indexOld++; + } else { + SESAME_INFO("ERROR !!! NO CENTER NEAREST !! \n"); } + } + + // left child: old centre + // struct TreeNode *lc = (struct TreeNode *) malloc(sizeof(struct TreeNode)); + TreeNodePtr lc = DataStructureFactory::createTreeNode(); + lc->centre = parent->centre; + lc->points = oldPoints; + lc->n = nOld; + + lc->lc = NULL; + lc->rc = NULL; + lc->parent = parent; + + treeNodeTargetFunctionValue(lc); + + // right child: new centre + // struct TreeNode *rc = (struct TreeNode *) malloc(sizeof(struct TreeNode)); + TreeNodePtr rc = DataStructureFactory::createTreeNode(); + rc->centre = newCentre; + rc->points = newPoints; + rc->n = nNew; + + rc->lc = NULL; + rc->rc = NULL; + rc->parent = parent; + + treeNodeTargetFunctionValue(rc); + + // set childs of the parent node + parent->lc = lc; + parent->rc = rc; + + // propagate the cost changes to the parent nodes + while (parent != NULL) { + parent->cost = parent->lc->cost + parent->rc->cost; + parent = parent->parent; + } } /** @@ -685,80 +574,63 @@ void SESAME::LandmarkWindow::CoresetTree::split(TreeNodePtr parent, PointPtr new * @param centreB * @return */ -SESAME::PointPtr SESAME::LandmarkWindow::CoresetTree::determineClosestCentre(PointPtr point, - PointPtr 
centreA, - PointPtr centreB) -{ - // loop counter variable - int l; - - // stores the distance between p and centreA - double distanceA = 0.0; - - for (l = 0; l < point->getDimension(); l++) - { - // centroid coordinate of the point - double centroidCoordinatePoint; - if (point->getWeight() != 0.0) - { - centroidCoordinatePoint = point->getFeatureItem(l) / point->getWeight(); - } - else - { - centroidCoordinatePoint = point->getFeatureItem(l); - } - // centroid coordinate of the centre - double centroidCoordinateCentre; - if (centreA->getWeight() != 0.0) - { - centroidCoordinateCentre = centreA->getFeatureItem(l) / centreA->getWeight(); - } - else - { - centroidCoordinateCentre = centreA->getFeatureItem(l); - } - - distanceA += (centroidCoordinatePoint - centroidCoordinateCentre) * - (centroidCoordinatePoint - centroidCoordinateCentre); +SESAME::PointPtr SESAME::LandmarkWindow::CoresetTree::determineClosestCentre( + PointPtr point, PointPtr centreA, PointPtr centreB) { + // loop counter variable + int l; + + // stores the distance between p and centreA + double distanceA = 0.0; + + for (l = 0; l < point->getDimension(); l++) { + // centroid coordinate of the point + double centroidCoordinatePoint; + if (point->getWeight() != 0.0) { + centroidCoordinatePoint = point->getFeatureItem(l) / point->getWeight(); + } else { + centroidCoordinatePoint = point->getFeatureItem(l); + } + // centroid coordinate of the centre + double centroidCoordinateCentre; + if (centreA->getWeight() != 0.0) { + centroidCoordinateCentre = + centreA->getFeatureItem(l) / centreA->getWeight(); + } else { + centroidCoordinateCentre = centreA->getFeatureItem(l); } - // stores the distance between p and centreB - double distanceB = 0.0; + distanceA += (centroidCoordinatePoint - centroidCoordinateCentre) * + (centroidCoordinatePoint - centroidCoordinateCentre); + } - for (l = 0; l < point->getDimension(); l++) - { - // centroid coordinate of the point - double centroidCoordinatePoint; - if 
(point->getWeight() != 0.0) - { - centroidCoordinatePoint = point->getFeatureItem(l) / point->getWeight(); - } - else - { - centroidCoordinatePoint = point->getFeatureItem(l); - } - // centroid coordinate of the centre - double centroidCoordinateCentre; - if (centreB->getWeight() != 0.0) - { - centroidCoordinateCentre = centreB->getFeatureItem(l) / centreB->getWeight(); - } - else - { - centroidCoordinateCentre = centreB->getFeatureItem(l); - } - - distanceB += (centroidCoordinatePoint - centroidCoordinateCentre) * - (centroidCoordinatePoint - centroidCoordinateCentre); - } + // stores the distance between p and centreB + double distanceB = 0.0; - // return the nearest centre - if (distanceA < distanceB) - { - return centreA; + for (l = 0; l < point->getDimension(); l++) { + // centroid coordinate of the point + double centroidCoordinatePoint; + if (point->getWeight() != 0.0) { + centroidCoordinatePoint = point->getFeatureItem(l) / point->getWeight(); + } else { + centroidCoordinatePoint = point->getFeatureItem(l); } - else - { - return centreB; + // centroid coordinate of the centre + double centroidCoordinateCentre; + if (centreB->getWeight() != 0.0) { + centroidCoordinateCentre = + centreB->getFeatureItem(l) / centreB->getWeight(); + } else { + centroidCoordinateCentre = centreB->getFeatureItem(l); } + + distanceB += (centroidCoordinatePoint - centroidCoordinateCentre) * + (centroidCoordinatePoint - centroidCoordinateCentre); + } + + // return the nearest centre + if (distanceA < distanceB) { + return centreA; + } else { + return centreB; + } } \ No newline at end of file diff --git a/src/Algorithm/WindowModel/DampedWindow.cpp b/src/Algorithm/WindowModel/DampedWindow.cpp index 42245b3b..d99f77ae 100644 --- a/src/Algorithm/WindowModel/DampedWindow.cpp +++ b/src/Algorithm/WindowModel/DampedWindow.cpp @@ -3,21 +3,21 @@ // #include #include -SESAME::DampedWindow::DampedWindow(double base, double lambda) -{ - this->base = base; - this->lambda = lambda; 
+SESAME::DampedWindow::DampedWindow(double base, double lambda) { + this->base = base; + this->lambda = lambda; } -double SESAME::DampedWindow::decayFunction(timespec startTime, timespec currentTimestamp) const -{ - long elapsedTime = - (((currentTimestamp).tv_sec * 1000000L + (currentTimestamp).tv_nsec / 1000L) - - ((startTime).tv_sec * 1000000L + (startTime).tv_nsec / 1000L)); - return pow(this->base, -1 * this->lambda * elapsedTime); +double SESAME::DampedWindow::decayFunction(timespec startTime, + timespec currentTimestamp) const { + long elapsedTime = + (((currentTimestamp).tv_sec * 1000000L + + (currentTimestamp).tv_nsec / 1000L) - + ((startTime).tv_sec * 1000000L + (startTime).tv_nsec / 1000L)); + return pow(this->base, -1 * this->lambda * elapsedTime); } -double SESAME::DampedWindow::decayFunction(int startTime, int currentTimestamp) const -{ - int elapsedTime = currentTimestamp - startTime; - return pow(this->base, -1 * this->lambda * elapsedTime); +double SESAME::DampedWindow::decayFunction(int startTime, + int currentTimestamp) const { + int elapsedTime = currentTimestamp - startTime; + return pow(this->base, -1 * this->lambda * elapsedTime); } \ No newline at end of file diff --git a/src/Algorithm/WindowModel/LandmarkWindow.cpp b/src/Algorithm/WindowModel/LandmarkWindow.cpp index eb1797f0..e3b3cde5 100644 --- a/src/Algorithm/WindowModel/LandmarkWindow.cpp +++ b/src/Algorithm/WindowModel/LandmarkWindow.cpp @@ -1,4 +1,5 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) // // Created by Shuhao Zhang on 20/07/2021. 
@@ -16,144 +17,143 @@ * @param: dim * @param: windowSize */ -void SESAME::LandmarkWindow::initWindow(int num) -{ - int i; - for (i = 0; i < this->windowManager.numberOfWindow; i++) - { - Window blankWindow; - vector p_list; - vector s_list; - for (int j = 0; j < num; j++) - { - p_list.push_back(DataStructureFactory::createPoint(0, 0, 0, 0, 0)); - s_list.push_back(DataStructureFactory::createPoint(0, 0, 0, 0, 0)); - } - blankWindow.points = p_list; - blankWindow.spillover = s_list; - blankWindow.cursize = 0; - this->windowManager.windows.push_back(blankWindow); +void SESAME::LandmarkWindow::initWindow(int num) { + int i; + for (i = 0; i < this->windowManager.numberOfWindow; i++) { + Window blankWindow; + vector p_list; + vector s_list; + for (int j = 0; j < num; j++) { + p_list.push_back(GenericFactory::New()); + s_list.push_back(GenericFactory::New()); } + blankWindow.points = p_list; + blankWindow.spillover = s_list; + blankWindow.cursize = 0; + this->windowManager.windows.push_back(blankWindow); + } } /** - * TODO: extract the incremental computation progress from the whole function. @Wangxin - * @Description: insert every data point into the time window. If both of the current original and - * spillover windows are full, construct a coreset tree using all data and store the computed - * results tree nodes into the next original window + * TODO: extract the incremental computation progress from the whole function. + * @Wangxin + * @Description: insert every data point into the time window. 
If both of the + * current original and spillover windows are full, construct a coreset tree + * using all data and store the computed results tree nodes into the next + * original window * @Param: Point: the single point data * @Return: void */ -void SESAME::LandmarkWindow::insertPoint(PointPtr point) -{ - // check if there is enough space in the first window - int cursize = this->windowManager.windows[0].cursize; - if (cursize >= this->windowManager.maxWindowSize) - { - // SESAME_DEBUG("Window 0 is full"); - // start spillover process - int curWindow = 0; - int nextWindow = 1; +void SESAME::LandmarkWindow::insertPoint(PointPtr point) { + // check if there is enough space in the first window + int cursize = this->windowManager.windows[0].cursize; + if (cursize >= this->windowManager.maxWindowSize) { + // SESAME_DEBUG("Window 0 is full"); + // start spillover process + int curWindow = 0; + int nextWindow = 1; - // check if the next window is empty - if (this->windowManager.windows[nextWindow].cursize == 0) - { - timerMeter.dataInsertAccMeasure(); - // SESAME_DEBUG("Window " << nextWindow << " is not full, move window " << - // curWindow << " points to window " - // << nextWindow); + // check if the next window is empty + if (this->windowManager.windows[nextWindow].cursize == 0) { + timerMeter.dataInsertAccMeasure(); + // SESAME_DEBUG("Window " << nextWindow << " is not full, move window + // " << curWindow << " points to window " + // << nextWindow); - // if empty, copy the window - int i; - for (i = 0; i < this->windowManager.maxWindowSize; i++) - { - this->windowManager.windows[nextWindow].points[i] = - this->windowManager.windows[curWindow].points[i]->copy(); - } - // window is now full - this->windowManager.windows[nextWindow].cursize = this->windowManager.maxWindowSize; - // first window is now set empty - this->windowManager.windows[curWindow].cursize = 0; - cursize = 0; - timerMeter.dataInsertEndMeasure(); - } - else - { - timerMeter.dataInsertAccMeasure(); 
+ // if empty, copy the window + int i; + for (i = 0; i < this->windowManager.maxWindowSize; i++) { + this->windowManager.windows[nextWindow].points[i] = + this->windowManager.windows[curWindow].points[i]->copy(); + } + // window is now full + this->windowManager.windows[nextWindow].cursize = + this->windowManager.maxWindowSize; + // first window is now set empty + this->windowManager.windows[curWindow].cursize = 0; + cursize = 0; + timerMeter.dataInsertEndMeasure(); + } else { + timerMeter.dataInsertAccMeasure(); - // if next window is full - // SESAME_DEBUG( - // "Window " << nextWindow << " is full, move window " << curWindow << " to - // spillover " << nextWindow); - // SESAME_DEBUG( - // "Window " << nextWindow << " is full"); - // copy the points in the current window to the next spillover and continue - int i; - for (i = 0; i < this->windowManager.maxWindowSize; i++) - { - this->windowManager.windows[nextWindow].spillover[i] = - this->windowManager.windows[curWindow].points[i]->copy(); - } - this->windowManager.windows[0].cursize = 0; - cursize = 0; - curWindow++; - nextWindow++; - timerMeter.dataInsertEndMeasure(); - // as long as the next window is full output the coreset to the next spillover, using - // points in the next window and spillover - while (this->windowManager.windows[nextWindow].cursize == - this->windowManager.maxWindowSize) - { - timerMeter.clusterUpdateAccMeasure(); + // if next window is full + // SESAME_DEBUG( + // "Window " << nextWindow << " is full, move window " << + // curWindow << " to spillover " << nextWindow); + // SESAME_DEBUG( + // "Window " << nextWindow << " is full"); + // copy the points in the current window to the next spillover and + // continue + int i; + for (i = 0; i < this->windowManager.maxWindowSize; i++) { + this->windowManager.windows[nextWindow].spillover[i] = + this->windowManager.windows[curWindow].points[i]->copy(); + } + this->windowManager.windows[0].cursize = 0; + cursize = 0; + curWindow++; + 
nextWindow++; + timerMeter.dataInsertEndMeasure(); + // as long as the next window is full output the coreset to the next + // spillover, using points in the next window and spillover + while (this->windowManager.windows[nextWindow].cursize == + this->windowManager.maxWindowSize) { + timerMeter.clusterUpdateAccMeasure(); - // SESAME_DEBUG("Window " << nextWindow - // << " is full, Continue! construct the coreset using - // points in window and spillover " - // << curWindow << " and store it in the spillover " << - // nextWindow); - // SESAME_DEBUG("Window " << nextWindow << " is full"); - this->tree->unionTreeCoreset(this->windowManager.maxWindowSize, - this->windowManager.maxWindowSize, - this->windowManager.maxWindowSize, - this->windowManager.windows[curWindow].points, - this->windowManager.windows[curWindow].spillover, - this->windowManager.windows[nextWindow].spillover); - // here we store the m constructed coreset points into the next spillover - // current window now empty - this->windowManager.windows[curWindow].cursize = 0; - curWindow++; - nextWindow++; - timerMeter.clusterUpdateEndMeasure(); - } - timerMeter.clusterUpdateAccMeasure(); + // SESAME_DEBUG("Window " << nextWindow + // << " is full, Continue! 
construct the coreset + // using points in window and spillover " + // << curWindow << " and store it in the + // spillover " << nextWindow); + // SESAME_DEBUG("Window " << nextWindow << " is full"); + this->tree->unionTreeCoreset( + this->windowManager.maxWindowSize, + this->windowManager.maxWindowSize, + this->windowManager.maxWindowSize, + this->windowManager.windows[curWindow].points, + this->windowManager.windows[curWindow].spillover, + this->windowManager.windows[nextWindow].spillover); + // here we store the m constructed coreset points into the next + // spillover current window now empty + this->windowManager.windows[curWindow].cursize = 0; + curWindow++; + nextWindow++; + timerMeter.clusterUpdateEndMeasure(); + } + timerMeter.clusterUpdateAccMeasure(); - // SESAME_DEBUG("Window " << nextWindow - // << " is not full, End! construct the coreset using points in - // window and spillover " - // << curWindow << " and store the it in the last spillover"); - // if the next window is empty, just do the same operation and store the constructed - // coreset into the next spillover, now the next window is empty but next spillover is - // full - this->tree->unionTreeCoreset( - this->windowManager.maxWindowSize, this->windowManager.maxWindowSize, - this->windowManager.maxWindowSize, this->windowManager.windows[curWindow].points, - this->windowManager.windows[curWindow].spillover, - this->windowManager.windows[nextWindow].points); - this->windowManager.windows[curWindow].cursize = 0; - this->windowManager.windows[nextWindow].cursize = this->windowManager.maxWindowSize; - timerMeter.clusterUpdateEndMeasure(); - } + // SESAME_DEBUG("Window " << nextWindow + // << " is not full, End! 
construct the coreset + // using points in window and spillover " + // << curWindow << " and store the it in the last + // spillover"); + // if the next window is empty, just do the same operation and store the + // constructed coreset into the next spillover, now the next window is + // empty but next spillover is full + this->tree->unionTreeCoreset( + this->windowManager.maxWindowSize, this->windowManager.maxWindowSize, + this->windowManager.maxWindowSize, + this->windowManager.windows[curWindow].points, + this->windowManager.windows[curWindow].spillover, + this->windowManager.windows[nextWindow].points); + this->windowManager.windows[curWindow].cursize = 0; + this->windowManager.windows[nextWindow].cursize = + this->windowManager.maxWindowSize; + timerMeter.clusterUpdateEndMeasure(); } - timerMeter.dataInsertAccMeasure(); - // if the first window is not full, just insert point into it - this->windowManager.windows[0].points[cursize] = point->copy(); // .copy(point); - this->windowManager.windows[0].cursize++; - timerMeter.dataInsertEndMeasure(); + } + timerMeter.dataInsertAccMeasure(); + // if the first window is not full, just insert point into it + this->windowManager.windows[0].points[cursize] = + point->copy(); // .copy(point); + this->windowManager.windows[0].cursize++; + timerMeter.dataInsertEndMeasure(); } /** -It may happen that the manager is not full (since n is not always a power of 2). In this case we -extract the coreset from the manager by computing a coreset of all nonempty windows +It may happen that the manager is not full (since n is not always a power of 2). 
+In this case we extract the coreset from the manager by computing a coreset of +all nonempty windows Case 1: the last bucket is full => n is a power of 2 and we return the contents of the last bucket @@ -163,42 +163,39 @@ Case2: the last bucket is not full this operation should only be called after the streaming process is sourceEnd **/ -std::vector SESAME::LandmarkWindow::getCoresetFromManager( - std::vector &coreset) -{ - int i = 0; - if (this->windowManager.windows[this->windowManager.numberOfWindow - 1].cursize == - this->windowManager.maxWindowSize) - { - coreset = this->windowManager.windows[this->windowManager.numberOfWindow - 1].points; +std::vector +SESAME::LandmarkWindow::getCoresetFromManager(std::vector &coreset) { + int i = 0; + if (this->windowManager.windows[this->windowManager.numberOfWindow - 1] + .cursize == this->windowManager.maxWindowSize) { + coreset = + this->windowManager.windows[this->windowManager.numberOfWindow - 1] + .points; + } else { + // find the first nonempty bucket + for (i = 0; i < this->windowManager.numberOfWindow; i++) { + if (this->windowManager.windows[i].cursize == + this->windowManager.maxWindowSize) { + coreset = this->windowManager.windows[i].points; + break; + } } - else - { - // find the first nonempty bucket - for (i = 0; i < this->windowManager.numberOfWindow; i++) - { - if (this->windowManager.windows[i].cursize == this->windowManager.maxWindowSize) - { - coreset = this->windowManager.windows[i].points; - break; - } - } - // as long as there is a nonempty bucket compute a coreset - int j; - for (j = i + 1; j < this->windowManager.numberOfWindow; j++) - { - if (this->windowManager.windows[j].cursize != 0) - { - // output the coreset into the spillover of bucket j - this->tree->unionTreeCoreset( - this->windowManager.maxWindowSize, this->windowManager.maxWindowSize, - this->windowManager.maxWindowSize, this->windowManager.windows[j].points, - coreset, this->windowManager.windows[j].spillover); - coreset = 
this->windowManager.windows[j].spillover; - } - } + // as long as there is a nonempty bucket compute a coreset + int j; + for (j = i + 1; j < this->windowManager.numberOfWindow; j++) { + if (this->windowManager.windows[j].cursize != 0) { + // output the coreset into the spillover of bucket j + this->tree->unionTreeCoreset(this->windowManager.maxWindowSize, + this->windowManager.maxWindowSize, + this->windowManager.maxWindowSize, + this->windowManager.windows[j].points, + coreset, + this->windowManager.windows[j].spillover); + coreset = this->windowManager.windows[j].spillover; + } } - return coreset; + } + return coreset; } /** @@ -207,15 +204,13 @@ std::vector SESAME::LandmarkWindow::getCoresetFromManager( * time_interval: the time interval of online pyramidal window frame * @Return: void */ -void SESAME::LandmarkWindow::initPyramidalWindow(unsigned int time_interval) -{ - QueueSnapshotPtr queueSnapshot; - queueSnapshot.reserve(time_interval + 1); - for (int i = 0; i < 50; i++) - { - orderSnapShots.push_back(queueSnapshot); - } - this->pyramidalWindow.currentOrder = 0; +void SESAME::LandmarkWindow::initPyramidalWindow(unsigned int time_interval) { + QueueSnapshotPtr queueSnapshot; + queueSnapshot.reserve(time_interval + 1); + for (int i = 0; i < 50; i++) { + orderSnapShots.push_back(queueSnapshot); + } + this->pyramidalWindow.currentOrder = 0; } /** @@ -226,57 +221,54 @@ void SESAME::LandmarkWindow::initPyramidalWindow(unsigned int time_interval) * @Return: void */ // TODO Still need to debug -void SESAME::LandmarkWindow::pyramidalWindowProcess(int elapsedTime, - const SESAME::MicroClusters µClusters) -{ - int i = -1; - if (elapsedTime > 0) +void SESAME::LandmarkWindow::pyramidalWindowProcess( + int elapsedTime, const SESAME::MicroClusters µClusters) { + int i = -1; + if (elapsedTime > 0) { + this->pyramidalWindow.currentOrder = + (int)(log(elapsedTime) / log(this->pyramidalWindow.time_interval)); + // NOTE: snapshot when elapsed time =0 always add to the front of 
latest T + // order + while (++i >= 0) //++i>=0 { - this->pyramidalWindow.currentOrder = - (int)(log(elapsedTime) / log(this->pyramidalWindow.time_interval)); - // NOTE: snapshot when elapsed time =0 always add to the front of latest T order - while (++i >= 0) //++i>=0 - { - if (this->pyramidalWindow.currentOrder >= i && - elapsedTime % (int)(pow(this->pyramidalWindow.time_interval, i)) == 0) - { - if (elapsedTime % (int)(pow(this->pyramidalWindow.time_interval, i + 1)) != 0) - { - storeSnapshot(i, microClusters, elapsedTime); - } - } - else - break; + if (this->pyramidalWindow.currentOrder >= i && + elapsedTime % (int)(pow(this->pyramidalWindow.time_interval, i)) == + 0) { + if (elapsedTime % + (int)(pow(this->pyramidalWindow.time_interval, i + 1)) != + 0) { + storeSnapshot(i, microClusters, elapsedTime); } + } else + break; } - else - storeSnapshot(0, microClusters, 0); + } else + storeSnapshot(0, microClusters, 0); } /** - * @Description: this function stores snapshots into the pyramidal window data structure + * @Description: this function stores snapshots into the pyramidal window data + * structure * @Param: currentOrder: the ith order snapshots stored into * microClusters: micro clusters' snapshot need to be stored * elapsedTime: the current elapsed time * @Return: void */ void SESAME::LandmarkWindow::storeSnapshot(unsigned int currentOrder, - const MicroClusters µClusters, int elapsedTime) -{ - // SESAME_INFO("taking snapshot"); + const MicroClusters µClusters, + int elapsedTime) { + // SESAME_INFO("taking snapshot"); - unsigned int size = orderSnapShots[currentOrder].size(); - SnapshotPtr snapshot; - snapshot = DataStructureFactory::createSnapshot( - const_cast &>(microClusters), elapsedTime); - // SESAME_INFO("The current order size is "<pyramidalWindow.time_interval + 1) - { - orderSnapShots[currentOrder].erase(orderSnapShots[currentOrder].begin()); - } - orderSnapShots[currentOrder].push_back(snapshot); + unsigned int size = 
orderSnapShots[currentOrder].size(); + SnapshotPtr snapshot; + snapshot = DataStructureFactory::createSnapshot( + const_cast &>(microClusters), elapsedTime); + // SESAME_INFO("The current order size is "<pyramidalWindow.time_interval + 1) { + orderSnapShots[currentOrder].erase(orderSnapShots[currentOrder].begin()); + } + orderSnapShots[currentOrder].push_back(snapshot); } -void SESAME::LandmarkWindow::clearPyramidalWindow() -{ - std::vector().swap(this->orderSnapShots); +void SESAME::LandmarkWindow::clearPyramidalWindow() { + std::vector().swap(this->orderSnapShots); } diff --git a/src/Algorithm/WindowModel/WindowFactory.cpp b/src/Algorithm/WindowModel/WindowFactory.cpp index cfe03bdf..5f0f6106 100644 --- a/src/Algorithm/WindowModel/WindowFactory.cpp +++ b/src/Algorithm/WindowModel/WindowFactory.cpp @@ -1,16 +1,16 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) // // Created by Shuhao Zhang on 27/07/2021. // #include -std::shared_ptr SESAME::WindowFactory::createLandmarkWindow() -{ - return std::make_shared(); +std::shared_ptr +SESAME::WindowFactory::createLandmarkWindow() { + return std::make_shared(); } -std::shared_ptr SESAME::WindowFactory::createDampedWindow(double base, - double lambda) -{ - return std::make_shared(base, lambda); +std::shared_ptr +SESAME::WindowFactory::createDampedWindow(double base, double lambda) { + return std::make_shared(base, lambda); } diff --git a/src/Algorithm/WindowModel/WindowModel.cpp b/src/Algorithm/WindowModel/WindowModel.cpp index 85a64deb..1cd62957 100644 --- a/src/Algorithm/WindowModel/WindowModel.cpp +++ b/src/Algorithm/WindowModel/WindowModel.cpp @@ -1,4 +1,5 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) // // Created by Shuhao Zhang on 26/07/2021. 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2d8729a1..2ad1b56e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,5 +1,4 @@ add_subdirectory(Algorithm) -add_subdirectory(APIs) add_subdirectory(Engine) add_subdirectory(Sources) add_subdirectory(Sinks) diff --git a/src/Engine/Engine.cpp b/src/Engine/Engine.cpp index 5b19d00a..87cb0d99 100644 --- a/src/Engine/Engine.cpp +++ b/src/Engine/Engine.cpp @@ -1,4 +1,5 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) // // Created by Shuhao Zhang on 19/07/2021. diff --git a/src/Engine/SimpleEngine.cpp b/src/Engine/SimpleEngine.cpp index 9250f30d..d4e16f72 100644 --- a/src/Engine/SimpleEngine.cpp +++ b/src/Engine/SimpleEngine.cpp @@ -1,4 +1,5 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) // // Created by Shuhao Zhang on 19/07/2021. 
@@ -8,10 +9,10 @@ #include "Utils/Logger.hpp" #include "Utils/SPSCQueue.hpp" -#include +#include #ifdef GPERF -# include +#include #endif #include @@ -23,38 +24,35 @@ using namespace std; SESAME::SimpleEngine::SimpleEngine(DataSourcePtr sourcePtr, DataSinkPtr sinkPtr, AlgorithmPtr algoPtr) - : threadID(0) -{ - this->sourcePtr = std::move(sourcePtr); - this->sinkPtr = std::move(sinkPtr); - this->algoPtr = std::move(algoPtr); - threadPtr = std::make_shared(); + : threadID(0) { + this->sourcePtr = std::move(sourcePtr); + this->sinkPtr = std::move(sinkPtr); + this->algoPtr = std::move(algoPtr); + threadPtr = std::make_shared(); } -void SESAME::SimpleEngine::run() -{ - barrierPtr = UtilityFunctions::createBarrier(3); - this->sourcePtr->setBarrier(barrierPtr); - this->sinkPtr->setBarrier(barrierPtr); +void SESAME::SimpleEngine::run() { + barrierPtr = UtilityFunctions::createBarrier(3); + this->sourcePtr->setBarrier(barrierPtr); + this->sinkPtr->setBarrier(barrierPtr); - // start source thread - this->sourcePtr->start(assignID()); + // start source thread + this->sourcePtr->start(assignID()); - // start engine thread(s) for algorithm. - this->start(sourcePtr, sinkPtr, algoPtr, assignID()); + // start engine thread(s) for algorithm. 
+ this->start(sourcePtr, sinkPtr, algoPtr, assignID()); - // start sink thread - this->sinkPtr->start(assignID()); + // start sink thread + this->sinkPtr->start(assignID()); } -bool SESAME::SimpleEngine::start(DataSourcePtr sourcePtr, DataSinkPtr sinkPtr, AlgorithmPtr algoPtr, - int id) -{ - auto fun = [this, sourcePtr, sinkPtr, algoPtr]() { - runningRoutine(sourcePtr, sinkPtr, algoPtr); - }; - threadPtr->construct(fun, id); - SESAME_DEBUG("Engine spawn thread=" << threadPtr->getID()); - return true; +bool SESAME::SimpleEngine::start(DataSourcePtr sourcePtr, DataSinkPtr sinkPtr, + AlgorithmPtr algoPtr, int id) { + auto fun = [this, sourcePtr, sinkPtr, algoPtr]() { + runningRoutine(sourcePtr, sinkPtr, algoPtr); + }; + threadPtr->construct(fun, id); + SESAME_DEBUG("Engine spawn thread=" << threadPtr->getID()); + return true; } /** * TODO: this is not generic enough to capture different algorithms. @@ -62,95 +60,92 @@ bool SESAME::SimpleEngine::start(DataSourcePtr sourcePtr, DataSinkPtr sinkPtr, A * @param sinkPtr * @param algoPtr */ -void SESAME::SimpleEngine::runningRoutine(DataSourcePtr sourcePtr, DataSinkPtr sinkPtr, - AlgorithmPtr algoPtr) -{ - barrierPtr->arrive_and_wait(); // wait for source and sink. - SESAME_INFO("Algorithm start to process data"); - overallMeter.START_MEASURE(); - overallMeter.overallStartMeasure(); - // We set observing interval for cumulative time: every 100 tuples - overallMeter.setInterval(100); - // initialization - - algoPtr->Init(); - - boost::progress_display show_progress(algoPtr->param.num_points, std::cerr, - "Online Clustering:\n"); +void SESAME::SimpleEngine::runningRoutine(DataSourcePtr sourcePtr, + DataSinkPtr sinkPtr, + AlgorithmPtr algoPtr) { + barrierPtr->arrive_and_wait(); // wait for source and sink. 
+ SESAME_INFO("Algorithm start to process data"); + overallMeter.START_MEASURE(); + overallMeter.overallStartMeasure(); + // We set observing interval for cumulative time: every 100 tuples + overallMeter.setInterval(100); + // initialization + + algoPtr->Init(); + + boost::timer::progress_display show_progress( + algoPtr->param.num_points, std::cerr, "Online Clustering:\n"); #ifdef GPERF - std::string prof = "/tmp/" + algoPtr->param.Workload() + "." + algoPtr->param.Name() + ".prof"; - ProfilerStart(prof.c_str()); + std::string prof = "/tmp/" + algoPtr->param.Workload() + "." + + algoPtr->param.Name() + ".prof"; + ProfilerStart(prof.c_str()); #endif - // run online clustering - while (!sourcePtr->sourceEnded()) - { // continuously processing infinite incoming data streams. - if (!sourcePtr->empty()) - { - auto item = sourcePtr->get(); - overallMeter.onlineAccMeasure(); - algoPtr->RunOnline(item->copy()); - algoPtr->Count(); - ++show_progress; - overallMeter.onlineAccEMeasure(); - } + // run online clustering + while (!sourcePtr->sourceEnded()) { // continuously processing infinite + // incoming data streams. + if (!sourcePtr->empty()) { + auto item = sourcePtr->get(); + overallMeter.onlineAccMeasure(); + algoPtr->RunOnline(item->copy()); + algoPtr->Count(); + ++show_progress; + overallMeter.onlineAccEMeasure(); } - while (!sourcePtr->empty()) - { // process the remaining data streams after source stops. - auto item = sourcePtr->get(); - overallMeter.onlineAccMeasure(); - algoPtr->RunOnline(item->copy()); - algoPtr->Count(); - ++show_progress; - overallMeter.onlineAccEMeasure(); - } - overallMeter.onlineEndMeasure(); - - SESAME_INFO("ready to offline clustering"); - - // run offline clustering - overallMeter.refinementStartMeasure(); - algoPtr->RunOffline(sinkPtr); - SESAME_INFO("Engine sourceEnd process data"); - overallMeter.refinementEndMeasure(); + } + while (!sourcePtr->empty()) { // process the remaining data streams after + // source stops. 
+ auto item = sourcePtr->get(); + overallMeter.onlineAccMeasure(); + algoPtr->RunOnline(item->copy()); + algoPtr->Count(); + ++show_progress; + overallMeter.onlineAccEMeasure(); + } + overallMeter.onlineEndMeasure(); + + SESAME_INFO("ready to offline clustering"); + + // run offline clustering + overallMeter.refinementStartMeasure(); + algoPtr->RunOffline(sinkPtr); + SESAME_INFO("Engine sourceEnd process data"); + overallMeter.refinementEndMeasure(); #ifdef GPERF - ProfilerStop(); + ProfilerStop(); #endif - overallMeter.overallEndMeasure(); - overallMeter.END_MEASURE(); - // TODO Add break down output + overallMeter.overallEndMeasure(); + overallMeter.END_MEASURE(); + // TODO Add break down output - sinkPtr->Ended(); // Let sink knows that there won't be any more data coming. - SESAME_INFO("Engine sourceEnd emit data"); - barrierPtr->arrive_and_wait(); // wait for source and sink. - SESAME_DEBUG("Engine sourceEnd wait for source and sink."); + sinkPtr->Ended(); // Let sink knows that there won't be any more data coming. + SESAME_INFO("Engine sourceEnd emit data"); + barrierPtr->arrive_and_wait(); // wait for source and sink. 
+ SESAME_DEBUG("Engine sourceEnd wait for source and sink."); } -bool SESAME::SimpleEngine::stop() -{ - if (threadPtr) - { - SESAME_DEBUG("Engine::stop try to join threads=" << threadPtr->getID()); - threadPtr->join(); - threadPtr.reset(); - } - else - { - SESAME_DEBUG("Engine " << threadPtr->getID() << ": Thread is not joinable"); - return false; - } - return true; +bool SESAME::SimpleEngine::stop() { + if (threadPtr) { + SESAME_DEBUG("Engine::stop try to join threads=" << threadPtr->getID()); + threadPtr->join(); + threadPtr.reset(); + } else { + SESAME_DEBUG("Engine " << threadPtr->getID() << ": Thread is not joinable"); + return false; + } + return true; } int SESAME::SimpleEngine::assignID() { return threadID++; } -void SESAME::SimpleEngine::printTime() -{ - SESAME_INFO("Engine takes " << overallMeter.MeterUSEC() << " useconds to finish."); - std::cout << "Engine takes " << overallMeter.MeterUSEC() << " useconds to finish." << std::endl; - std::cout << "Online Time: " << overallMeter.MeterOnlineUSEC() << "\n" - << "Refinement Time: " << overallMeter.MeterRefinementUSEC() << "\n" - << "Overall Time: " << overallMeter.MeterOverallUSEC() << "\n" - << std::endl; +void SESAME::SimpleEngine::printTime() { + SESAME_INFO("Engine takes " << overallMeter.MeterUSEC() + << " useconds to finish."); + std::cout << "Engine takes " << overallMeter.MeterUSEC() + << " useconds to finish." 
<< std::endl; + std::cout << "Online Time: " << overallMeter.MeterOnlineUSEC() << "\n" + << "Refinement Time: " << overallMeter.MeterRefinementUSEC() << "\n" + << "Overall Time: " << overallMeter.MeterOverallUSEC() << "\n" + << std::endl; } diff --git a/src/Engine/SingleThread.cpp b/src/Engine/SingleThread.cpp index da682da5..63307b25 100644 --- a/src/Engine/SingleThread.cpp +++ b/src/Engine/SingleThread.cpp @@ -1,14 +1,14 @@ -// Copyright (C) 2021 by the IntelliStream team (https://github.com/intellistream) +// Copyright (C) 2021 by the IntelliStream team +// (https://github.com/intellistream) // // Created by Shuhao Zhang on 20/8/21. // #include -void SESAME::SingleThread::construct(std::function fun, int id) -{ - this->ThreadPtr = std::make_shared(fun); - this->setID(id); +void SESAME::SingleThread::construct(std::function fun, int id) { + this->ThreadPtr = std::make_shared(fun); + this->setID(id); } int SESAME::SingleThread::setID(int id) { return this->id = id; } int SESAME::SingleThread::getID() { return this->id; } diff --git a/src/Evaluation/CMM.cpp b/src/Evaluation/CMM.cpp index 41a5bfd4..c292aa1e 100644 --- a/src/Evaluation/CMM.cpp +++ b/src/Evaluation/CMM.cpp @@ -8,68 +8,60 @@ #include "Utils/Logger.hpp" #include "Utils/UtilityFunctions.hpp" -#include #include #include #include #include +#include #include #include -namespace SESAME -{ +namespace SESAME { /** * @Description: CMMPoint */ -CMMPoint::CMMPoint(int id, long startTime, long time, std::vector &vec, double a, - double lambda, int truth) -{ - this->id = id; - this->startTime = startTime; - this->dim = (int)vec.size(); - this->vec = vec; - this->weight = pow(a, lambda * (time - startTime)); - this->truth = truth; +CMMPoint::CMMPoint(int id, long startTime, long time, std::vector &vec, + double a, double lambda, int truth) { + this->id = id; + this->startTime = startTime; + this->dim = (int)vec.size(); + this->vec = vec; + this->weight = pow(a, lambda * (time - startTime)); + this->truth = truth; } 
-double CMMPoint::getDisTo(CMMPointPtr &p) -{ - double dis = 0; - double temp = 0; - for (int i = 0; i < dim; i++) - { - temp = p->vec[i] - vec[i]; - dis += temp * temp; - } - return sqrt(dis); +double CMMPoint::getDisTo(CMMPointPtr &p) { + double dis = 0; + double temp = 0; + for (int i = 0; i < dim; i++) { + temp = p->vec[i] - vec[i]; + dis += temp * temp; + } + return sqrt(dis); } -double CMMPoint::knnDis(int k, CMMCluster &c) -{ - std::vector list = c.points; - int size = c.points.size(); - std::vector diss(size); - for (int i = 0; i < size; i++) - { - CMMPointPtr p = list.at(i); - if (list.at(i)->id != this->id) - { - diss[i] = getDisTo(list.at(i)); - } +double CMMPoint::knnDis(int k, CMMCluster &c) { + std::vector list = c.points; + int size = c.points.size(); + std::vector diss(size); + for (int i = 0; i < size; i++) { + CMMPointPtr p = list.at(i); + if (list.at(i)->id != this->id) { + diss[i] = getDisTo(list.at(i)); } - sort(diss.begin(), diss.end()); - double sum = 0; - int num = 0; - for (int i = 0; i < k && i < size; i++) - { - sum += diss[i]; - num++; - } - if (sum == 0) - return 1; - else - return sum / num; + } + sort(diss.begin(), diss.end()); + double sum = 0; + int num = 0; + for (int i = 0; i < k && i < size; i++) { + sum += diss[i]; + num++; + } + if (sum == 0) + return 1; + else + return sum / num; } /** @@ -79,213 +71,168 @@ CMMCluster::CMMCluster() {} void CMMCluster::add(CMMPointPtr &p) { this->points.push_back(p); } -void CMMCluster::getConn() -{ - int size = (int)points.size(); - std::vector knhPDis(size); - double sum = 0; - for (int i = 0; i < size; i++) - { - CMMPointPtr p = points.at(i); - knhPDis[i] = p->knnDis(CMM_KNN, *this); - // the paper says that the value of k has only marginal influence on CMM - // effectiveness. 
so here we set k = 10 - sum += knhPDis[i]; - } - knhDis = sum / size; - for (int i = 0; i < size; i++) - { - if (type == GTCluster) - { - if (knhPDis[i] < knhDis) - { - points.at(i)->conCL = 1; - } - else - { - if (points.size() == 1) - { - points.at(i)->conCL = 1; - } - else - { - points.at(i)->conCL = knhDis / knhPDis[i]; - } - } - } - else - { - if (knhPDis[i] < knhDis) - { - points.at(i)->con = 1; - } - else - { - points.at(i)->con = knhDis / knhPDis[i]; - } +void CMMCluster::getConn() { + int size = (int)points.size(); + std::vector knhPDis(size); + double sum = 0; + for (int i = 0; i < size; i++) { + CMMPointPtr p = points.at(i); + knhPDis[i] = p->knnDis(CMM_KNN, *this); + // the paper says that the value of k has only marginal influence on CMM + // effectiveness. so here we set k = 10 + sum += knhPDis[i]; + } + knhDis = sum / size; + for (int i = 0; i < size; i++) { + if (type == GTCluster) { + if (knhPDis[i] < knhDis) { + points.at(i)->conCL = 1; + } else { + if (points.size() == 1) { + points.at(i)->conCL = 1; + } else { + points.at(i)->conCL = knhDis / knhPDis[i]; } + } + } else { + if (knhPDis[i] < knhDis) { + points.at(i)->con = 1; + } else { + points.at(i)->con = knhDis / knhPDis[i]; + } } + } } -CMMDriver::CMMDriver(int dim, double a, double lambda) -{ - this->dim = dim; - this->a = a; - this->lambda = lambda; - // this->k = k; +CMMDriver::CMMDriver(int dim, double a, double lambda) { + this->dim = dim; + this->a = a; + this->lambda = lambda; + // this->k = k; } -void CMMDriver::load(const std::vector &inputs, const std::vector &predicts, - int dim, double time) -{ - // time ? weight ? 
- // convert to the predicted clustering center index - for (int i = 0; i < inputs.size(); i++) - { - std::vector features; - for (int j = 0; j < dim; j++) - { - features.push_back(inputs.at(i)->getFeatureItem(j)); - } - int cl = inputs.at(i)->getClusteringCenter(); - assert(cl != -1); - CMMPointPtr p = - std::make_shared(predicts.at(i)->getIndex(), (long)predicts.at(i)->getIndex(), - time, features, a, lambda, cl); - if (CL.count(cl)) - { - CL[cl]->add(p); - } - else - { - CMMClusterPtr c = std::make_shared(); - c->groundTruth = cl; - c->type = GTCluster; - c->add(p); - CL.insert(std::pair(cl, c)); - CLlist.push_back(c); - } - int ci = predicts.at(i)->getClusteringCenter(); - if (C.count(ci)) - { - C[ci]->add(p); - } - else - { - CMMClusterPtr c = std::make_shared(); - ; - c->groundTruth = ci; - c->type = Cluster; - c->add(p); - C.insert(std::pair(ci, c)); - Clist.push_back(c); - } +void CMMDriver::load(const std::vector &inputs, + const std::vector &predicts, int dim, + double time) { + // time ? weight ? 
+ // convert to the predicted clustering center index + for (int i = 0; i < inputs.size(); i++) { + std::vector features; + for (int j = 0; j < dim; j++) { + features.push_back(inputs.at(i)->getFeatureItem(j)); + } + int cl = inputs.at(i)->getClusteringCenter(); + assert(cl != -1); + CMMPointPtr p = std::make_shared(predicts.at(i)->getIndex(), + (long)predicts.at(i)->getIndex(), + time, features, a, lambda, cl); + if (CL.count(cl)) { + CL[cl]->add(p); + } else { + CMMClusterPtr c = std::make_shared(); + c->groundTruth = cl; + c->type = GTCluster; + c->add(p); + CL.insert(std::pair(cl, c)); + CLlist.push_back(c); + } + int ci = predicts.at(i)->getClusteringCenter(); + if (C.count(ci)) { + C[ci]->add(p); + } else { + CMMClusterPtr c = std::make_shared(); + ; + c->groundTruth = ci; + c->type = Cluster; + c->add(p); + C.insert(std::pair(ci, c)); + Clist.push_back(c); } + } } -void CMMDriver::voteMap() -{ - /* - here CMM uses the mapping strategy rather than the vote, and in the paper - the author also says that Mapping clusters based on majority voting cannot - recognize emerging or disappearing clusters. - */ - int csize = (int)Clist.size(); - for (int i = 0; i < csize; i++) - { - CMMClusterPtr c = Clist.at(i); - std::unordered_map map; - std::vector list = c->points; - int psize = (int)list.size(); - for (int j = 0; j < psize; j++) - { - CMMPointPtr p = list.at(j); - int truth = p->truth; - if (map.count(truth)) - { - map.insert(std::pair(truth, map.at(truth) + p->weight)); - } - else - { - map.insert(std::pair(truth, p->weight)); - } - } - int label = 1; - double max = 0; - for (auto &m : map) - { - if (m.second > max) - { - max = m.second; - label = m.first; - assert(label != -1); - } - } - c->groundTruth = label; +void CMMDriver::voteMap() { + /* + here CMM uses the mapping strategy rather than the vote, and in the paper + the author also says that Mapping clusters based on majority voting cannot + recognize emerging or disappearing clusters. 
+ */ + int csize = (int)Clist.size(); + for (int i = 0; i < csize; i++) { + CMMClusterPtr c = Clist.at(i); + std::unordered_map map; + std::vector list = c->points; + int psize = (int)list.size(); + for (int j = 0; j < psize; j++) { + CMMPointPtr p = list.at(j); + int truth = p->truth; + if (map.count(truth)) { + map.insert(std::pair(truth, map.at(truth) + p->weight)); + } else { + map.insert(std::pair(truth, p->weight)); + } } + int label = 1; + double max = 0; + for (auto &m : map) { + if (m.second > max) { + max = m.second; + label = m.first; + assert(label != -1); + } + } + c->groundTruth = label; + } } -void CMMDriver::getFaultSet() -{ - int csize = (int)Clist.size(); - for (int i = 0; i < csize; i++) - { - CMMClusterPtr c = Clist.at(i); - int truth = c->groundTruth; - std::vector list = c->points; - int psize = list.size(); - for (int j = 0; j < psize; j++) - { - CMMPointPtr p = list.at(j); - if (p->truth != truth) - { - faultSet.push_back(p); - if (!faultClu.contains(c)) - { - faultClu.insert(c); - faultClu.insert(CL.at(p->truth)); - } - } +void CMMDriver::getFaultSet() { + int csize = (int)Clist.size(); + for (int i = 0; i < csize; i++) { + CMMClusterPtr c = Clist.at(i); + int truth = c->groundTruth; + std::vector list = c->points; + int psize = list.size(); + for (int j = 0; j < psize; j++) { + CMMPointPtr p = list.at(j); + if (p->truth != truth) { + faultSet.push_back(p); + if (!faultClu.contains(c)) { + faultClu.insert(c); + faultClu.insert(CL.at(p->truth)); } + } } + } } -double CMMDriver::compCMM() -{ - getFaultSet(); - if (faultSet.empty()) - { - return 1; - } - compCon(); - double totalPen = 0; - double totalCon = 0; - int faultPsize = (int)faultSet.size(); - for (int i = 0; i < faultPsize; i++) - { - CMMPointPtr p = faultSet.at(i); - totalPen += p->weight * p->conCL * (1 - p->con); - totalCon += p->weight * p->conCL; - } - if (totalCon == 0) - { - return 0; - } - else - return 1 - totalPen / totalCon; +double CMMDriver::compCMM() { + getFaultSet(); 
+ if (faultSet.empty()) { + return 1; + } + compCon(); + double totalPen = 0; + double totalCon = 0; + int faultPsize = (int)faultSet.size(); + for (int i = 0; i < faultPsize; i++) { + CMMPointPtr p = faultSet.at(i); + totalPen += p->weight * p->conCL * (1 - p->con); + totalCon += p->weight * p->conCL; + } + if (totalCon == 0) { + return 0; + } else + return 1 - totalPen / totalCon; } -void CMMDriver::compCon() -{ - for (auto &c : faultClu) - { - c->getConn(); - } +void CMMDriver::compCon() { + for (auto &c : faultClu) { + c->getConn(); + } } -double CMMDriver::computeWeight(double deltaTime) -{ - const double belta = 2; - const double lamda = 1; - return pow(belta, lamda * (deltaTime)); +double CMMDriver::computeWeight(double deltaTime) { + const double belta = 2; + const double lamda = 1; + return pow(belta, lamda * (deltaTime)); } // template