diff --git a/pom.xml b/pom.xml
index ddbc07727f..df2bd21ca1 100644
--- a/pom.xml
+++ b/pom.xml
@@ -350,5 +350,10 @@
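       <artifactId>commons-pool2</artifactId>
       <version>2.6.0</version>
+    <dependency>
+      <groupId>org.wikidata.wdtk</groupId>
+      <artifactId>wdtk-dumpfiles</artifactId>
+      <version>0.10.0</version>
+    </dependency>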
diff --git a/src/main/java/io/anserini/kg/CountWikidataDocuments.java b/src/main/java/io/anserini/kg/CountWikidataDocuments.java
new file mode 100644
index 0000000000..0947353047
--- /dev/null
+++ b/src/main/java/io/anserini/kg/CountWikidataDocuments.java
@@ -0,0 +1,108 @@
+/**
+ * Anserini: A Lucene toolkit for replicable information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.anserini.kg;
+
+import org.apache.commons.lang3.time.DurationFormatUtils;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.kohsuke.args4j.CmdLineException;
+import org.kohsuke.args4j.CmdLineParser;
+import org.kohsuke.args4j.Option;
+import org.kohsuke.args4j.OptionHandlerFilter;
+import org.kohsuke.args4j.ParserProperties;
+import org.wikidata.wdtk.dumpfiles.DumpProcessingController;
+import org.wikidata.wdtk.dumpfiles.MwLocalDumpFile;
+import org.wikidata.wdtk.datamodel.interfaces.*;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Class for counting documents from a Wikidata dump. Illustrates usage of Wikidata tools.
+ */
+public class CountWikidataDocuments {
+  private static final Logger LOG = LogManager.getLogger(CountWikidataDocuments.class);
+
+  /** Command-line options; the field is populated by args4j from the {@code -input} flag. */
+  static final class Args {
+    @Option(name = "-input", metaVar = "[path]", required = true, usage = "path to dump file")
+    private String input;
+  }
+
+  /** Parses the arguments, runs the dump through a counting processor, and logs totals and elapsed time. */
+  public static void main(String[] args) throws Exception {
+    Args dumpArgs = new Args();
+    CmdLineParser parser = new CmdLineParser(dumpArgs, ParserProperties.defaults().withUsageWidth(90));
+    try {
+      parser.parseArgument(args);
+    } catch (CmdLineException e) {
+      System.err.println(e.getMessage());
+      parser.printUsage(System.err);
+      System.err.println("Example: " + CountWikidataDocuments.class.getSimpleName()
+          + parser.printExample(OptionHandlerFilter.REQUIRED));
+      return;
+    }
+
+    // This code is adapted from https://github.com/Wikidata/Wikidata-Toolkit-Examples/blob/master/src/examples/LocalDumpFileExample.java
+    DumpProcessingController dumpProcessingController = new DumpProcessingController("wikidatawiki");
+    CountingEntityDocumentProcessor processor = new CountingEntityDocumentProcessor();
+    dumpProcessingController.registerEntityDocumentProcessor(processor, "wikidatawiki", true);
+
+    final long start = System.nanoTime();
+    LOG.info("Starting to process dump...");
+    dumpProcessingController.processDump(new MwLocalDumpFile(dumpArgs.input));
+    LOG.info(processor.itemCount + " total items encountered");
+    LOG.info(processor.lexemeCount + " total lexemes encountered");
+    LOG.info(processor.propertyCount + " total properties encountered");
+    long duration = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
+    LOG.info("Dump processed in " + DurationFormatUtils.formatDuration(duration, "HH:mm:ss"));
+  }
+
+  /** Counts the item, lexeme and property documents passed to it, logging progress along the way. */
+  private static class CountingEntityDocumentProcessor implements EntityDocumentProcessor {
+    public int itemCount = 0;
+    public int lexemeCount = 0;
+    public int propertyCount = 0;
+
+    // Items are Entities that are typically represented by a Wikipage.
+    // See https://www.mediawiki.org/wiki/Wikibase/DataModel#Items
+    @Override
+    public void processItemDocument(ItemDocument itemDocument) {
+      logProgress(++itemCount, "items");
+    }
+
+    // A Lexeme is a lexical element of a language, such as a word, a phrase, or a prefix.
+    // See https://www.wikidata.org/wiki/Wikidata:Lexicographical_data/Documentation
+    @Override
+    public void processLexemeDocument(LexemeDocument lexemeDocument) {
+      logProgress(++lexemeCount, "lexemes");
+    }
+
+    // Properties are Entities that describe a relationship between Items (or other Entities) and Values of the property.
+    // See https://www.mediawiki.org/wiki/Wikibase/DataModel#Properties
+    @Override
+    public void processPropertyDocument(PropertyDocument propertyDocument) {
+      logProgress(++propertyCount, "properties");
+    }
+
+    /** Logs "{@code count label} encountered" whenever {@code count} is a multiple of one million. */
+    private static void logProgress(int count, String label) {
+      if (count % 1000000 == 0) {
+        LOG.info(count + " " + label + " encountered");
+      }
+    }
+  }
+}
\ No newline at end of file