diff --git a/benchmark/requirements.txt b/benchmark/requirements.txt
new file mode 100644
index 0000000..8f94e92
--- /dev/null
+++ b/benchmark/requirements.txt
@@ -0,0 +1,4 @@
+pandas==1.3.3
+glob2==0.7
+dtaidistance==2.3.10
+numpy==1.21.2
\ No newline at end of file
diff --git a/joss/paper.bib b/joss/paper.bib
index 6bcdb62..ff01207 100644
--- a/joss/paper.bib
+++ b/joss/paper.bib
@@ -42,11 +42,13 @@ @article{Petitjean2011
 year = {2011},
 }
 
-@article{meert2020wannesm,
- title={wannesm/dtaidistance v2. 0.0},
- author={Meert, Wannes and Hendrickx, Kilian and Van Craenendonck, Toon},
+@article{meert2022wannesm,
+ title={wannesm/dtaidistance v2.3.10},
+ author={Meert, Wannes and Hendrickx, Kilian and Van Craenendonck, Toon and Robberechts, Pieter and Blockeel, Hendrik and Davis, Jesse},
 journal={Zenodo},
- year={2020}
+ year={2022},
+ doi={10.5281/zenodo.1202378},
+ url={https://zenodo.org/records/7158824}
 }
 
 @inproceedings{Shahapure2020,
@@ -62,6 +64,18 @@ @inproceedings{Shahapure2020
 year = {2020},
 }
 
+@article{dau2019ucr,
+ title={The UCR time series archive},
+ author={Dau, Hoang Anh and Bagnall, Anthony and Kamgar, Kaveh and Yeh, Chin-Chia Michael and Zhu, Yan and Gharghabi, Shaghayegh and Ratanamahatana, Chotirat Ann and Keogh, Eamonn},
+ journal={IEEE/CAA Journal of Automatica Sinica},
+ volume={6},
+ number={6},
+ pages={1293--1305},
+ year={2019},
+ publisher={IEEE},
+ doi={10.1109/JAS.2019.1911747}
+}
+
 @misc{UCRArchive2018,
 title = {The UCR Time Series Classification Archive},
 author = {Dau, Hoang Anh and Keogh, Eamonn and Kamgar, Kaveh and Yeh, Chin-Chia Michael and Zhu, Yan
@@ -111,14 +125,6 @@ @misc{Tavenard2020
 year = {2020},
 }
 
-@misc{Dau2018,
- author = {Hoang Anh Dau and Eamonn Keogh and Kaveh Kamgar and Chin-Chia Michael Yeh and Yan Zhu and Shaghayegh Gharghabi and Chotirat Ann Ratanamahatana and Yanping and Bing Hu and Nurjahan Begum and Anthony Bagnall and Abdullah Mueen and Gustavo Batista},
- month = {10},
- title = {The UCR Time Series Classification Archive},
- url = {https://www.cs.ucr.edu/~eamonn/time_series_data_2018/},
- year = {2018},
-}
-
 @article{Huangfu2018,
 abstract = {This paper introduces the design and implementation of two parallel dual simplex solvers for general large scale sparse linear programming problems. One approach, called PAMI, extends a relatively unknown pivoting strategy called suboptimization and exploits parallelism across multiple iterations. The other, called SIP, exploits purely single iteration parallelism by overlapping computational components when possible. Computational results show that the performance of PAMI is superior to that of the leading open-source simplex solver, and that SIP complements PAMI in achieving speedup when PAMI results in slowdown. One of the authors has implemented the techniques underlying PAMI within the FICO Xpress simplex solver and this paper presents computational results demonstrating their value. In developing the first parallel revised simplex solver of general utility, this work represents a significant achievement in computational optimization.},
 author = {Q. Huangfu and J. A.J. Hall},
@@ -138,7 +144,7 @@ @article{Huangfu2018
 @misc{gurobi,
 author = {{Gurobi Optimization, LLC}},
 title = {{Gurobi Optimizer Reference Manual}},
- year = 2023,
+ year = 2024,
 url = "https://www.gurobi.com"
 }
 
@@ -149,6 +155,7 @@ @misc{kumtepeli2023fast
 eprint={2307.04904},
 archivePrefix={arXiv},
+ doi = {10.48550/arXiv.2307.04904},
 primaryClass={eess.SP}
 }
 
 @article{ROUSSEEUW198753,
@@ -158,7 +165,7 @@ @article{ROUSSEEUW198753
 pages = {53-65},
 year = {1987},
 issn = {0377-0427},
-doi = {https://doi.org/10.1016/0377-0427(87)90125-7},
+doi = {10.1016/0377-0427(87)90125-7},
 url = {https://www.sciencedirect.com/science/article/pii/0377042787901257},
 author = {Peter J. Rousseeuw},
 keywords = {Graphical display, cluster analysis, clustering validity, classification},
diff --git a/joss/paper.md b/joss/paper.md
index 60be446..1efbf9a 100644
--- a/joss/paper.md
+++ b/joss/paper.md
@@ -33,7 +33,7 @@ Time-series data analysis is of interest in a huge number of different applicati
 
 The target audience for this software is very broad, since clustering of time series data is relevant in many applications from energy to finance and medicine. However, as data availability increases, so does the complexity of the clustering problem. Most time series clustering algorithms depend on dimension reduction or feature extraction techniques to enable scaling to large datasets, but this can induce bias in the clustering [@Aghabozorgi2015]. Dynamic time warping [@Sakoe1978] is a well-known technique for manipulating time series to enable comparisons between datasets, using local warping (stretching or compressing along the time axis) of the elements within each time series to find an optimal alignment between series. This emphasises the similarity of the shapes of the respective time series rather than the exact alignment of specific features. Unfortunately, DTW does not scale well in computational speed as the length and number of time series to be compared increases---the computational complexity grows quadratically with the total number of data points. This is a barrier to DTW being widely implemented in large-scale time series clustering [@Rajabi2020]. In response, `DTW-C++` was written to handle large time series efficiently, directly processing the raw data rather than first extracting features.
 
-In contrast to existing tools available for time series clustering using DTW, such as `DTAIDistance` [@meert2020wannesm] and `TSlearn` [@Tavenard2020], `DTW-C++` offers significant improvements in speed and memory use, enabling larger datasets to be clustered. This is achieved by
+In contrast to existing tools available for time series clustering using DTW, such as `DTAIDistance` [@meert2022wannesm] and `TSlearn` [@Tavenard2020], `DTW-C++` offers significant improvements in speed and memory use, enabling larger datasets to be clustered. This is achieved by
 
 1. task-level parallelisation, where multiple pairwise comparisons between time series can be evaluated simultaneously, and,
 2. improved memory management---since the clustering algorithm only needs the final distance computed between pairwise time series, the DTW distance computation stores only the most recent previous vector, rather than the entire warping matrix.
@@ -121,16 +121,16 @@ $$
 This integer program is solved in `DTW-C++` using Gurobi [@gurobi] or HiGHS [@Huangfu2018]. After solution, the non-zero diagonal entries of $A$ represent the centroids, and the non-zero elements in the corresponding columns in $A$ represent the members of that cluster.
 In the example in \autoref{fig:A_matrix}, the clusters are time series 1, **2**, 5 and 3, **4** with the bold type face entries indicating the centroids.
 
-Finding a globally optimal solution with this method can result in increased computation times depending on the number of time series within the dataset and the DTW distances. Therefore, there is also a built-in option to cluster using k-medoids, as used in other packages such as \texttt{DTAIDistance} [@meert2020wannesm]. The k-medoids method is often quicker as it is an iterative approach, however it is subject to getting stuck in local optima. The results in the next section show the timing and memory performance of both integer programming clustering and k-medoids clustering using \texttt{DTW-C++} compared to other packages.
+Finding a globally optimal solution with this method can result in increased computation times depending on the number of time series within the dataset and the DTW distances. Therefore, there is also a built-in option to cluster using k-medoids, as used in other packages such as \texttt{DTAIDistance} [@meert2022wannesm]. The k-medoids method is often quicker as it is an iterative approach; however, it is subject to getting stuck in local optima. The results in the next section show the timing and memory performance of both integer programming clustering and k-medoids clustering using \texttt{DTW-C++} compared to other packages.
 
 # Comparison
 
-We compared our approach with two other DTW clustering packages, \texttt{DTAIDistance} [@meert2020wannesm] and \texttt{TSlearn} [@Tavenard2020] using data from the UCR Time Series Classification Archive [@Dau2018], which consists of 128 time series datasets with up to 16,800 data series of lengths up to 2,844. Benchmarking against \texttt{TSlearn} was stopped after the first 22 datasets because the results were consistently over 20 times slower than \texttt{DTW-C++}. \autoref{tab:small_table} shows the results for datasets downselected to have the number of time series, $N$, greater than 100, and the length of each time series greater than 500 points. This is because \texttt{DTW-C++} is aimed at larger datasets where the speed improvements are more relevant.
+We compared our approach with two other DTW clustering packages, \texttt{DTAIDistance} [@meert2022wannesm] and \texttt{TSlearn} [@Tavenard2020], using data from the UCR Time Series Classification Archive [@UCRArchive2018; @dau2019ucr], which consists of 128 time series datasets with up to 16,800 data series of lengths up to 2,844. Benchmarking against \texttt{TSlearn} was stopped after the first 22 datasets because the results were consistently over 20 times slower than \texttt{DTW-C++}. \autoref{tab:small_table} shows the results for datasets downselected to have the number of time series, $N$, greater than 100, and the length of each time series greater than 500 points. This is because \texttt{DTW-C++} is aimed at larger datasets where the speed improvements are more relevant.
 
 \begin{table}[]
 \resizebox{\textwidth}{!}{%
 \begin{tabular}{l|p{.125\textwidth}p{.125\textwidth}p{.125\textwidth}p{.125\textwidth}p{.125\textwidth}p{.125\textwidth}}
- & Number of time series & Length of time series & DTW-C++ IP (s) & DTW-C++ k-Medoids (s) & DTAI Distance (s) & Time decrease (\%) \\
+ & Number of time series & Length of time series & DTW-C++ IP (s) & DTW-C++ k-Medoids (s) & DTAI Distance* (s) & Time decrease (\%) \\
 \hline
 CinCECGTorso & 1380 & 1639 & 3008.4 & \textbf{1104.2} & 1955.9 & 44 \\
 Computers & 250 & 720 & 16.1 & \textbf{10.5} & 12.8 & 18 \\
@@ -162,7 +162,8 @@ SemgHandSubjectCh2 & 450 & 1500 & 186.4 & \textbf{96.7} & 177
 ShapesAll & 600 & 512 & 67.5 & \textbf{15.1} & 44.4 & 66 \\
 SmallKitchenAppliances & 375 & 720 & 41.7 & \textbf{23.8} & 30.1 & 21 \\
 StarLightCurves & 8236 & 1024 & N/A & \textbf{18551.7} & 27558.1 & 33 \\
-UWaveGestureLibraryAll & 3582 & 945 & N/A & \textbf{1194.6} & 4436.9 & 73
+UWaveGestureLibraryAll & 3582 & 945 & N/A & \textbf{1194.6} & 4436.9 & 73 \\ \hline
+\multicolumn{7}{l}{*Benchmark results for Python libraries \textit{may} include an overhead of 10\% due to the usage of the \textit{tracemalloc} library.}
 \end{tabular}}
 \caption{Computational time comparison between \texttt{DTW-C++} using integer programming and k-medoids, vs.\ \texttt{DTAIDistance}, and \texttt{TSlearn}, on datasets in the UCR Time Series Classification Archive where $N>100$ and $L>500$. The fastest result for each dataset is in bold type.} \label{tab:small_table}
 \end{table}
@@ -181,7 +182,7 @@ The performance comparison on all datasets in the UCR Time Series Classification
 
 # Acknowledgements
 
-We are grateful for discussions of this topic with [Battery Intelligence Lab](https://howey.eng.ox.ac.uk) members, and thank BBOXX for project funding and access to data. This work was also funded by the UKRI PFER Energy Superhub Oxford demonstrator and the ``Data-driven exploration of the carbon emissions impact of grid energy storage deployment and dispatch'' project (EP/W027321/1). The authors would like to particularly thank Dau, Keogh, et al. for their extensive efforts in compiling a diverse range of datasets for the UCR Time Series Classification Archive.
+We are grateful for discussions of this topic with [Battery Intelligence Lab](https://howey.eng.ox.ac.uk) members, and thank BBOXX for project funding and access to data. This work was also funded by the UKRI PFER Energy Superhub Oxford demonstrator and the ``Data-driven exploration of the carbon emissions impact of grid energy storage deployment and dispatch'' project (EP/W027321/1). The authors would like to particularly thank Dau, Keogh, et al. for their extensive efforts in compiling a diverse range of datasets for the [UCR Time Series Classification Archive](https://www.cs.ucr.edu/~eamonn/time_series_data_2018/).
 
 # References