diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..667e9f7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,14 @@ +*.class + +# Package Files # +*.jar +*.war +*.ear + +/target + +Thumbs.db +.DS_Store + +*~ +*.bak \ No newline at end of file diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md new file mode 100644 index 0000000..6ad0f58 --- /dev/null +++ b/README.md @@ -0,0 +1,72 @@ +# Sparesort + +**Sparesort** is a Java library for performing sequential pattern mining, which is a data mining problem for analyzing sequencing or time related processes, e.g. customer purchase or web access patterns. +This library implements BIDE, an efficient algorithm for mining frequent closed sequences. + +## System Requirements + +* Java Runtime Environment (JRE) 1.5 or higher + +## Usage + +### From console + +The distribution contains an executable JAR (`sparesort-X.Y.Z.jar`). +You can simply try the algorithm by running the `java -jar` command in your console. + + java -classpath commons-lang3-3.1.jar:slf4j-api-1.6.4.jar \ + -jar sparesort-X.Y.Z.jar \ + min_sup sequence1 sequence2 sequence3 ... + +Note that dependent libraries ([Commons Lang](http://commons.apache.org/proper/commons-lang/) and [SLF4J](http://www.slf4j.org/)) are provided by the classpath argument. +This console application takes the following two kinds of arguments: + +* The first argument `min_sup` is an absolute minimum support, an integer larger than 1. +* The other arguments `sequence1 sequence2 sequence3 ...` are paths to sequence files. Each file contains one sequence of string items separated by new lines. 
+ +After mining is finished, all frequent closed sequences qualified by the threshold and their frequency are printed to the standard output. + +### Within your code + +1. Create an instance of `jp.ac.titech.cs.se.sparesort.SequenceDatabase` which supports `addSequence(List<T>)` and `addSequence(T...)` methods to load item sequences. + + // Create and load sequence database + SequenceDatabase<String> sdb = new SequenceDatabase<String>() + .addSequence("C", "A", "A", "B", "C") + .addSequence("A", "B", "C", "B") + .addSequence("C", "A", "B", "C") + .addSequence("A", "B", "B", "C", "A"); + +2. (Optional) Set an alternative mining strategy. When no strategy is given, `jp.ac.titech.cs.se.sparesort.bide.RecursiveBIDE` is used as a default strategy to perform depth-first mining based on recursive calls. + + sdb.setMiningStrategy(new RecursiveBIDE<String>()); + +3. Execute `mineFrequentClosedSequences(int)` method giving an absolute minimum support threshold as its argument. +The return value is a mapping from the frequent closed sequences found in the sequence database to their frequencies. + + Map<List<String>, Integer> result = sdb.mineFrequentClosedSequences(2); + +## Compile and build + +Sparesort is built by using [Apache Maven](http://maven.apache.org/). +Before you proceed, download Maven and configure the environment variable `M2_HOME` properly. + +1. Fork (or download) the source code. +2. Open your terminal or command prompt, move to the directory containing pom.xml and hit `mvn package` there. +3. After maven build succeeds, you will find `sparesort-X.Y.Z.jar` in `target/` subdirectory. + +## References + +BIDE algorithm was presented by Wang and Han in the following paper: + +* Jianyong Wang, Jiawei Han: “BIDE: Efficient mining of frequent closed sequences”. In Proceedings of the 20th International Conference on Data Engineering (ICDE 2004), 30 March - 2 April 2004, Boston, USA, pp. 79–90. 
[doi:10.1109/ICDE.2004.1319986](http://dx.doi.org/10.1109/ICDE.2004.1319986) + +Sparesort is designed for stimulating researchers in software and data engineering. +If you use this library in your research and publications, please cite the following article as its source: + +* Hiroshi Kazato, Shinpei Hayashi, Tsuyoshi Oshima, Shunsuke Miyata, Takashi Hoshino, Motoshi Saeki: "Extracting and Visualizing Implementation Structure of Features". In Proceedings of the 20th Asia-Pacific Software Engineering Conference (APSEC 2013). Bangkok, Thailand, December 2013. + +## Copyright and license + +Copyright (c) 2010-2012 [Saeki Lab.](http://www.se.cs.titech.ac.jp/) at [Tokyo Institute of Technology](http://www.titech.ac.jp/). +Sparesort is open source software, licensed under the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0). diff --git a/pom.xml b/pom.xml new file mode 100755 index 0000000..ad25c6d --- /dev/null +++ b/pom.xml @@ -0,0 +1,50 @@ + + 4.0.0 + jp.ac.titech.cs.se + sparesort + 0.2.0 + + Saeki Lab. at Tokyo Institute of Technology + http://www.se.cs.titech.ac.jp/ + + + + junit + junit + 4.10 + test + + + org.apache.commons + commons-lang3 + 3.1 + + + org.slf4j + slf4j-api + 1.6.4 + + + ch.qos.logback + logback-classic + 1.0.0 + runtime + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + + true + jp.ac.titech.cs.se.sparesort.Main + + + + + + + \ No newline at end of file diff --git a/src/main/java/jp/ac/titech/cs/se/sparesort/Main.java b/src/main/java/jp/ac/titech/cs/se/sparesort/Main.java new file mode 100755 index 0000000..240ca72 --- /dev/null +++ b/src/main/java/jp/ac/titech/cs/se/sparesort/Main.java @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2010-2012 Saeki Lab. at Tokyo Institute of Technology. + * All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package jp.ac.titech.cs.se.sparesort; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import org.apache.commons.lang3.StringUtils; + +public class Main { + + public static void main(String[] args) throws Exception { + SequenceDatabase sdb = new SequenceDatabase(); + + int minSup = Integer.parseInt(args[0]); + for (int i = 1; i < args.length; i++) { + sdb.addSequence(loadStringListFromFile(args[i])); + } + + Map, Integer> result = sdb.mineFrequentClosedSequences(minSup); + for (Map.Entry, Integer> entry : result.entrySet()) { + System.out.println(entry.getKey() + ":" + entry.getValue()); + } + } + + public static List loadStringListFromFile(String path) + throws Exception { + List events = new ArrayList(); + BufferedReader reader = new BufferedReader(new FileReader(path)); + + String line = null; + while ((line = reader.readLine()) != null) { + events.add(StringUtils.chomp(line)); + } + + reader.close(); + return events; + } + +} diff --git a/src/main/java/jp/ac/titech/cs/se/sparesort/MiningStrategy.java b/src/main/java/jp/ac/titech/cs/se/sparesort/MiningStrategy.java new file mode 100644 index 0000000..4045a09 --- /dev/null +++ b/src/main/java/jp/ac/titech/cs/se/sparesort/MiningStrategy.java @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2010-2012 Saeki Lab. at Tokyo Institute of Technology. + * All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package jp.ac.titech.cs.se.sparesort; + +public interface MiningStrategy { + + public void mineFrequentClosedSequences(SequenceDatabase sdb, + int minSup, ResultHandler handler) throws Exception; + +} diff --git a/src/main/java/jp/ac/titech/cs/se/sparesort/ResultHandler.java b/src/main/java/jp/ac/titech/cs/se/sparesort/ResultHandler.java new file mode 100755 index 0000000..42b1e33 --- /dev/null +++ b/src/main/java/jp/ac/titech/cs/se/sparesort/ResultHandler.java @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2010-2012 Saeki Lab. at Tokyo Institute of Technology. + * All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * An identified sequence of events, optionally projected past a prefix.
 * <p>
 * A projected sequence shares the event list of its parent and only remembers
 * the offset at which the projection starts. The event list is wrapped, not
 * copied, so later changes to the list passed to the constructor show through.
 *
 * @param <T> type of the events
 */
public class Sequence<T> {

    private final String id;

    /** Complete, unprojected event list (read-only view). */
    private final List<T> events;

    /** Index just past the first instance of the projection prefix; 0 if unprojected. */
    private final int offset;

    /**
     * Creates an unprojected sequence.
     *
     * @param id     identifier of the sequence
     * @param events events of the sequence, in order
     */
    public Sequence(String id, List<T> events) {
        super();
        this.id = id;
        this.events = Collections.unmodifiableList(events);
        this.offset = 0;
    }

    /** Creates a projection of {@code parent} starting at {@code offset}. */
    private Sequence(Sequence<T> parent, int offset) {
        super();
        this.id = parent.id;
        this.events = parent.events;
        this.offset = offset;
    }

    /**
     * Projects this sequence with respect to a prefix.
     *
     * @param prefix items to locate, in order, as a subsequence of the full
     *               event list
     * @return a sequence whose events start right after the first instance of
     *         {@code prefix}, or {@code null} if the prefix does not occur
     */
    public Sequence<T> getProjectedSequenceWithRespectTo(List<T> prefix) {
        int offset = locateFirstInstanceOf(prefix);
        return (offset == -1) ? null : new Sequence<T>(this, offset);
    }

    /** @return the identifier given at construction */
    public String getId() {
        return id;
    }

    /** @return the events from the projection offset to the end */
    public List<T> getEvents() {
        return (offset == 0) ? events : events.subList(offset, events.size());
    }

    /**
     * Greedily matches {@code prefix} against the full event list from the start.
     *
     * @return the index just past the last matched item, or -1 if some item of
     *         the prefix cannot be matched
     */
    private int locateFirstInstanceOf(List<T> prefix) {
        int length = events.size();
        int pos = 0;
        prefix_loop: for (T prefix_i : prefix) {
            while (pos < length) {
                if (prefix_i.equals(events.get(pos++))) {
                    continue prefix_loop;
                }
            }
            return -1;
        }
        return pos;
    }

    /**
     * @return the events up to and including the last occurrence of the final
     *         prefix item, or {@code null} if the prefix is empty or that item
     *         never occurs
     */
    private List<T> getLastInstanceOf(List<T> prefix) {
        if (prefix.isEmpty()) {
            return null;
        }
        int index = events.lastIndexOf(prefix.get(prefix.size() - 1));
        return (index >= 0) ? events.subList(0, index + 1) : null;
    }

    /**
     * Matches prefix items {@code prefix.size()-1 .. i} backwards from the end
     * of the last instance.
     *
     * @return the index matched for item {@code i}, or -1 if the match fails
     */
    private int locateLastInLastAppearanceWithRespectTo(List<T> prefix, int i) {
        List<T> lastInstance = getLastInstanceOf(prefix);
        if (lastInstance == null) {
            // The prefix cannot occur here; previously this dereferenced null.
            return -1;
        }
        int pos = lastInstance.size();

        prefix_loop: for (int index = prefix.size() - 1; index >= i; index--) {
            T prefix_i = prefix.get(index);
            while (--pos >= 0) {
                if (prefix_i.equals(lastInstance.get(pos))) {
                    continue prefix_loop;
                }
            }
            return -1;
        }
        return pos;
    }

    /**
     * Returns the events between the end of the first instance of
     * {@code prefix[0..i)} and the position of item {@code i} in the
     * last-in-last appearance of the prefix.
     *
     * @param prefix prefix to examine
     * @param i      zero-based index of the prefix item
     * @return the period (possibly empty), or {@code null} if the prefix does
     *         not occur in this sequence or the two positions cross
     */
    public List<T> getMaximumPeriodOf(List<T> prefix, int i) {
        int from = (i == 0) ? 0 : locateFirstInstanceOf(prefix.subList(0, i));
        if (from < 0) {
            // Leading part of the prefix is absent; subList(-1, ..) would throw.
            return null;
        }
        int to = locateLastInLastAppearanceWithRespectTo(prefix, i);
        return (from <= to) ? events.subList(from, to) : null;
    }

    /**
     * Matches prefix items {@code prefix.size()-1 .. i} backwards, starting
     * just before the projection offset.
     *
     * @return the index matched for item {@code i}, or -1 if the match fails
     */
    private int locateLastInFirstAppearanceWithRespectTo(List<T> prefix, int i) {
        int pos = offset;

        prefix_loop: for (int index = prefix.size() - 1; index >= i; index--) {
            T prefix_i = prefix.get(index);
            while (--pos >= 0) {
                if (prefix_i.equals(events.get(pos))) {
                    continue prefix_loop;
                }
            }
            return -1;
        }
        return pos;
    }

    /**
     * Same as {@link #getMaximumPeriodOf(List, int)} but the right end is taken
     * from a backward match that starts at the projection offset of this
     * sequence instead of at the last instance.
     *
     * @param prefix prefix to examine
     * @param i      zero-based index of the prefix item
     * @return the period (possibly empty), or {@code null} if the prefix does
     *         not occur before the offset or the two positions cross
     */
    public List<T> getSemiMaximumPeriodOf(List<T> prefix, int i) {
        int from = (i == 0) ? 0 : locateFirstInstanceOf(prefix.subList(0, i));
        if (from < 0) {
            // Leading part of the prefix is absent; subList(-1, ..) would throw.
            return null;
        }
        int to = locateLastInFirstAppearanceWithRespectTo(prefix, i);
        return (from <= to) ? events.subList(from, to) : null;
    }

}
null : new SequenceDatabase(prefix, + projections); + } + + public List> getSequences() { + return sequences; + } + + public SequenceDatabase addSequence(String id, List events) { + sequences.add(new Sequence(id, events)); + return this; + } + + public SequenceDatabase addSequence(List events) { + return addSequence(String.valueOf(getSupportOfPrefix() + 1), events); + } + + public SequenceDatabase addSequence(T... events) { + return addSequence(Arrays.asList(events)); + } + + public List getPrefix() { + return prefix; + } + + public int getSupportOfPrefix() { + return (sequences == null) ? 0 : sequences.size(); + } + + public MiningStrategy getMiningStrategy() { + if (strategy == null) { + strategy = new RecursiveBIDE(); + } + return strategy; + } + + public void setMiningStrategy(MiningStrategy strategy) { + this.strategy = strategy; + } + + public Map, Integer> mineFrequentClosedSequences(int minSup) + throws Exception { + final Map, Integer> result = new HashMap, Integer>(); + + mineFrequentClosedSequences(minSup, new ResultHandler() { + public void handle(List sequence, int frequency, + SequenceDatabase sdb) { + result.put(sequence, frequency); + } + }); + + return result; + } + + public void mineFrequentClosedSequences(int minSup, ResultHandler handler) + throws Exception { + getMiningStrategy().mineFrequentClosedSequences(this, minSup, handler); + } + + public SortedMap getItemFrequencies(int minSup) { + SortedMap frequencies = new TreeMap(); + for (Sequence sequence : sequences) { + for (T item : new HashSet(sequence.getEvents())) { + int f = frequencies.containsKey(item) ? 
frequencies.get(item) + : 0; + frequencies.put(item, ++f); + } + } + + Iterator iter = frequencies.values().iterator(); + while (iter.hasNext()) { + if (iter.next() < minSup) { + iter.remove(); + } + } + return frequencies; + } + +} diff --git a/src/main/java/jp/ac/titech/cs/se/sparesort/bide/BIDE.java b/src/main/java/jp/ac/titech/cs/se/sparesort/bide/BIDE.java new file mode 100644 index 0000000..363f4a6 --- /dev/null +++ b/src/main/java/jp/ac/titech/cs/se/sparesort/bide/BIDE.java @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2010-2012 Saeki Lab. at Tokyo Institute of Technology. + * All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package jp.ac.titech.cs.se.sparesort.bide; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.SortedMap; + +import jp.ac.titech.cs.se.sparesort.ResultHandler; +import jp.ac.titech.cs.se.sparesort.MiningStrategy; +import jp.ac.titech.cs.se.sparesort.Sequence; +import jp.ac.titech.cs.se.sparesort.SequenceDatabase; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public abstract class BIDE implements MiningStrategy { + + protected static final Logger logger = LoggerFactory.getLogger(BIDE.class); + + protected List> bide(SequenceDatabase sdb, int minSup, + ResultHandler handler) { + List prefix = sdb.getPrefix(); + + if (prefix != null && backScan(sdb)) { + return Collections.emptyList(); + } + + SortedMap frequencies = sdb.getItemFrequencies(minSup); + if (isPrefixClosed(sdb, frequencies)) { + synchronized (handler) { + handler.handle(prefix, sdb.getSupportOfPrefix(), sdb); + } + } + + List> prefixes = new ArrayList>(); + for (Map.Entry entry : frequencies.entrySet()) { + List extendedPrefix = new ArrayList(); + if (prefix != null) { + extendedPrefix.addAll(prefix); + } + extendedPrefix.add(entry.getKey()); + prefixes.add(extendedPrefix); + } + return prefixes; + } + + protected boolean isPrefixClosed(SequenceDatabase sdb, + SortedMap frequencies) { + List prefix = sdb.getPrefix(); + if (prefix == null) { + return false; + } + + int support = sdb.getSupportOfPrefix(); + if (logger.isDebugEnabled()) { + logger.debug("Testing: {}:{}", prefix, support); + } + + for (Map.Entry entry : frequencies.entrySet()) { + if (entry.getValue() == support) { + if (logger.isTraceEnabled()) { + logger.trace( + "Open: The prefix can be extended by a forward extension item {}.", + entry.getKey()); + } + return false; /* Found a forward extension item */ + } + } + + if (!hasBackwardExtensionItem(sdb)) { + if (logger.isDebugEnabled()) { + 
logger.debug("Closed: {}:{}", prefix, support); + } + return true; + } + return false; + } + + protected boolean hasBackwardExtensionItem(SequenceDatabase sdb) { + List prefix = sdb.getPrefix(); + prefix_loop: for (int i = 0; i < prefix.size(); i++) { + Set intersection = null; + + for (Sequence sequence : sdb.getSequences()) { + List mp_i = sequence.getMaximumPeriodOf(prefix, i); + if (mp_i == null || mp_i.isEmpty()) { + continue prefix_loop; + } + + if (intersection == null) { + intersection = new HashSet(mp_i); + } else { + intersection.retainAll(mp_i); + if (intersection.isEmpty()) { + continue prefix_loop; + } + } + } + + if (logger.isTraceEnabled()) { + logger.trace( + "Open: The prefix can be extended by backward extension items {}, found in each of the {} maximum period.", + intersection, toOrdinal(i)); + } + return true; + } + return false; + } + + protected boolean backScan(SequenceDatabase sdb) { + List prefix = sdb.getPrefix(); + prefix_loop: for (int i = 0; i < prefix.size(); i++) { + Set intersection = null; + + for (Sequence sequence : sdb.getSequences()) { + List smp_i = sequence.getSemiMaximumPeriodOf(prefix, i); + if (smp_i == null || smp_i.isEmpty()) { + continue prefix_loop; + } + + if (intersection == null) { + intersection = new HashSet(smp_i); + } else { + intersection.retainAll(smp_i); + if (intersection.isEmpty()) { + continue prefix_loop; + } + } + } + + if (logger.isTraceEnabled()) { + logger.trace( + "Pruned: Search space was pruned by the BackScan method. Items {} exists in each of the {} semi-maximum period.", + intersection, toOrdinal(i)); + } + return true; + } + return false; + } + + public static String toOrdinal(int i) { + int mod = i % 10; + return i + (mod == 1 ? "st" : mod == 2 ? "nd" : mod == 3 ? 
"rd" : "th"); + } + +} diff --git a/src/main/java/jp/ac/titech/cs/se/sparesort/bide/ConcurrentBIDE.java b/src/main/java/jp/ac/titech/cs/se/sparesort/bide/ConcurrentBIDE.java new file mode 100644 index 0000000..0aff611 --- /dev/null +++ b/src/main/java/jp/ac/titech/cs/se/sparesort/bide/ConcurrentBIDE.java @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2010-2012 Saeki Lab. at Tokyo Institute of Technology. + * All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package jp.ac.titech.cs.se.sparesort.bide; + +import java.util.ArrayList; +import java.util.List; + +import jp.ac.titech.cs.se.sparesort.ResultHandler; +import jp.ac.titech.cs.se.sparesort.SequenceDatabase; +import jp.ac.titech.cs.se.sparesort.util.RecursiveTask; +import jp.ac.titech.cs.se.sparesort.util.RecursiveTaskRunner; + +public class ConcurrentBIDE extends BIDE { + + public void mineFrequentClosedSequences(SequenceDatabase sdb, + int minSup, ResultHandler handler) throws Exception { + RecursiveTaskRunner runner = new RecursiveTaskRunner(); + runner.run(new BIDETask(sdb, minSup, handler)); + } + + private class BIDETask implements RecursiveTask { + + private final SequenceDatabase sdb; + + private final int minSup; + + private final ResultHandler handler; + + public BIDETask(SequenceDatabase sdb, int minSup, + ResultHandler handler) { + super(); + this.sdb = sdb; + this.minSup = minSup; + this.handler = handler; + } + + public List call() { + List subtasks = new ArrayList(); + for 
(List extendedPrefix : bide(sdb, minSup, handler)) { + subtasks.add(new BIDETask(sdb + .getProjectedDatabaseWithRespectTo(extendedPrefix), + minSup, handler)); + } + return subtasks; + } + } + +} diff --git a/src/main/java/jp/ac/titech/cs/se/sparesort/bide/RecursiveBIDE.java b/src/main/java/jp/ac/titech/cs/se/sparesort/bide/RecursiveBIDE.java new file mode 100644 index 0000000..97bde16 --- /dev/null +++ b/src/main/java/jp/ac/titech/cs/se/sparesort/bide/RecursiveBIDE.java @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2010-2012 Saeki Lab. at Tokyo Institute of Technology. + * All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package jp.ac.titech.cs.se.sparesort.bide; + +import java.util.List; + +import jp.ac.titech.cs.se.sparesort.ResultHandler; +import jp.ac.titech.cs.se.sparesort.SequenceDatabase; + +public class RecursiveBIDE extends BIDE { + + public void mineFrequentClosedSequences(SequenceDatabase sdb, + int minSup, ResultHandler handler) throws Exception { + applyBIDE(sdb, minSup, handler, 0, 100); + } + + private void applyBIDE(SequenceDatabase sdb, int minSup, + ResultHandler handler, double minProgress, double maxProgress) { + double progress = minProgress; + + List> prefixes = bide(sdb, minSup, handler); + int size = prefixes.size(); + for (int i = 0; i < size; i++) { + double nextProgress = minProgress + (maxProgress - minProgress) + * ((double) (i + 1)) / ((double) size); + applyBIDE(sdb.getProjectedDatabaseWithRespectTo(prefixes.get(i)), + minSup, handler, progress, nextProgress); + progress = nextProgress; + } + + if (logger.isDebugEnabled()) { + logger.debug(String.format("Progress: %.3f%%", progress)); + } + } + +} diff --git a/src/main/java/jp/ac/titech/cs/se/sparesort/util/RecursiveTask.java b/src/main/java/jp/ac/titech/cs/se/sparesort/util/RecursiveTask.java new file mode 100644 index 0000000..1ed5767 --- /dev/null +++ b/src/main/java/jp/ac/titech/cs/se/sparesort/util/RecursiveTask.java @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2010-2012 Saeki Lab. at Tokyo Institute of Technology. + * All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * A unit of work whose result is the list of follow-up tasks it produced.
 *
 * <p>It narrows {@link Callable#call()} so that implementations do not throw
 * checked exceptions.
 */
public interface RecursiveTask extends Callable<List<RecursiveTask>> {

    /**
     * Executes this task.
     *
     * @return the tasks to execute next; may be empty
     */
    @Override
    List<RecursiveTask> call();

}
+ */ +package jp.ac.titech.cs.se.sparesort.util; + +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.concurrent.CompletionService; +import java.util.concurrent.ExecutorCompletionService; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; + +public class RecursiveTaskRunner { + + private final ExecutorService executor; + + private final CompletionService> queue; + + private final Set> waiting; + + public RecursiveTaskRunner() { + this(Executors.newCachedThreadPool()); + } + + public RecursiveTaskRunner(int nThreads) { + this(Executors.newFixedThreadPool(nThreads)); + } + + public RecursiveTaskRunner(ExecutorService executor) { + this.executor = executor; + this.queue = new ExecutorCompletionService>( + executor); + this.waiting = new HashSet>(); + } + + public void run(RecursiveTask task) throws Exception { + waiting.add(queue.submit(task)); + + while (!waiting.isEmpty()) { + Future> completed = queue.take(); + waiting.remove(completed); + + List subtasks = completed.get(); + if (subtasks != null) { + for (RecursiveTask subtask : subtasks) { + waiting.add(queue.submit(subtask)); + } + } + } + executor.shutdown(); + executor.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS); + } + +} diff --git a/src/test/java/jp/ac/titech/cs/se/sparesort/SequenceDatabaseTest.java b/src/test/java/jp/ac/titech/cs/se/sparesort/SequenceDatabaseTest.java new file mode 100755 index 0000000..dcd0aec --- /dev/null +++ b/src/test/java/jp/ac/titech/cs/se/sparesort/SequenceDatabaseTest.java @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2010-2012 Saeki Lab. at Tokyo Institute of Technology. + * All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package jp.ac.titech.cs.se.sparesort; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +import jp.ac.titech.cs.se.sparesort.bide.ConcurrentBIDE; +import jp.ac.titech.cs.se.sparesort.bide.RecursiveBIDE; + +import org.junit.Test; + +public class SequenceDatabaseTest { + + @Test + public void testRecursiveBIDE() throws Exception { + SequenceDatabase fixture = new SequenceDatabase() + .addSequence("C", "A", "A", "B", "C") + .addSequence("A", "B", "C", "B") + .addSequence("C", "A", "B", "C") + .addSequence("A", "B", "B", "C", "A"); + + fixture.setMiningStrategy(new RecursiveBIDE()); + Map, Integer> result = fixture.mineFrequentClosedSequences(2); + + assertNotNull(result); + assertEquals(6, result.size()); + assertTrue(result.containsKey(Arrays.asList("A", "A"))); + assertTrue(result.containsKey(Arrays.asList("A", "B", "B"))); + assertTrue(result.containsKey(Arrays.asList("A", "B", "C"))); + assertTrue(result.containsKey(Arrays.asList("C", "A"))); + assertTrue(result.containsKey(Arrays.asList("C", "A", "B", "C"))); + assertTrue(result.containsKey(Arrays.asList("C", "B"))); + } + + @Test + public void testConcurrentBIDE() throws Exception { + SequenceDatabase fixture = new SequenceDatabase() + .addSequence("C", "A", "A", "B", "C") + .addSequence("A", "B", "C", "B") + .addSequence("C", "A", "B", "C") + .addSequence("A", "B", "B", "C", "A"); + + fixture.setMiningStrategy(new 
ConcurrentBIDE()); + Map, Integer> result = fixture.mineFrequentClosedSequences(2); + + assertNotNull(result); + assertEquals(6, result.size()); + assertTrue(result.containsKey(Arrays.asList("A", "A"))); + assertTrue(result.containsKey(Arrays.asList("A", "B", "B"))); + assertTrue(result.containsKey(Arrays.asList("A", "B", "C"))); + assertTrue(result.containsKey(Arrays.asList("C", "A"))); + assertTrue(result.containsKey(Arrays.asList("C", "A", "B", "C"))); + assertTrue(result.containsKey(Arrays.asList("C", "B"))); + } +} diff --git a/src/test/resources/logback-test.xml b/src/test/resources/logback-test.xml new file mode 100644 index 0000000..70d74ec --- /dev/null +++ b/src/test/resources/logback-test.xml @@ -0,0 +1,11 @@ + + + + %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + + +