diff --git a/pom.xml b/pom.xml index ddbc07727f..df2bd21ca1 100644 --- a/pom.xml +++ b/pom.xml @@ -350,5 +350,10 @@ commons-pool2 2.6.0 + + org.wikidata.wdtk + wdtk-dumpfiles + 0.10.0 + diff --git a/src/main/java/io/anserini/kg/CountWikidataDocuments.java b/src/main/java/io/anserini/kg/CountWikidataDocuments.java new file mode 100644 index 0000000000..0947353047 --- /dev/null +++ b/src/main/java/io/anserini/kg/CountWikidataDocuments.java @@ -0,0 +1,108 @@ +/** + * Anserini: A Lucene toolkit for replicable information retrieval research + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.anserini.kg; + +import org.apache.commons.lang3.time.DurationFormatUtils; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.kohsuke.args4j.CmdLineException; +import org.kohsuke.args4j.CmdLineParser; +import org.kohsuke.args4j.Option; +import org.kohsuke.args4j.OptionHandlerFilter; +import org.kohsuke.args4j.ParserProperties; +import org.wikidata.wdtk.dumpfiles.DumpProcessingController; +import org.wikidata.wdtk.dumpfiles.MwLocalDumpFile; +import org.wikidata.wdtk.datamodel.interfaces.*; + +import java.util.concurrent.TimeUnit; + +/** + * Class for counting documents from a Wikidata dump. Illustrates usage of Wikidata tools. + */ +public class CountWikidataDocuments { + private static final Logger LOG = LogManager.getLogger(CountWikidataDocuments.class); + + static final class Args { + @Option(name = "-input", metaVar = "[path]", required = true, usage = "path to dump file") + private String input; + } + + public static void main(String[] args) throws Exception { + Args dumpArgs = new Args(); + CmdLineParser parser = new CmdLineParser(dumpArgs, ParserProperties.defaults().withUsageWidth(90)); + + try { + parser.parseArgument(args); + } catch (CmdLineException e) { + System.err.println(e.getMessage()); + parser.printUsage(System.err); + System.err.println("Example: "+ LookupFreebaseNodes.class.getSimpleName() + + parser.printExample(OptionHandlerFilter.REQUIRED)); + return; + } + + // This code is adapted from https://github.com/Wikidata/Wikidata-Toolkit-Examples/blob/master/src/examples/LocalDumpFileExample.java + DumpProcessingController dumpProcessingController = new DumpProcessingController("wikidatawiki"); + CountingEntityDocumentProcessor processor = new CountingEntityDocumentProcessor(); + dumpProcessingController.registerEntityDocumentProcessor(processor,"wikidatawiki", true); + + final long start = System.nanoTime(); + LOG.info("Starting to process dump..."); + + MwLocalDumpFile dumpFile = new MwLocalDumpFile(dumpArgs.input); + dumpProcessingController.processDump(dumpFile); + + LOG.info(processor.itemCount + " total items encountered"); + LOG.info(processor.lexemeCount + " total lexemes encountered"); + LOG.info(processor.propertyCount + " total properties encountered"); + long duration = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS); + LOG.info("Dump processed in " + DurationFormatUtils.formatDuration(duration, "HH:mm:ss")); + } + + private static class CountingEntityDocumentProcessor implements EntityDocumentProcessor { + public int itemCount = 0; + public int lexemeCount = 0; + public int propertyCount = 0; + + // Items are Entities that are typically represented by a Wikipage. + // See https://www.mediawiki.org/wiki/Wikibase/DataModel#Items + public void processItemDocument​(ItemDocument itemDocument) { + itemCount++; + if (itemCount % 1000000 == 0) { + LOG.info(itemCount + " items encountered"); + } + } + + // A Lexeme is a lexical element of a language, such as a word, a phrase, or a prefix. + // See https://www.wikidata.org/wiki/Wikidata:Lexicographical_data/Documentation + public void processLexemeDocument​(LexemeDocument lexemeDocument) { + lexemeCount++; + if (lexemeCount % 1000000 == 0) { + LOG.info(lexemeCount + " lexemes encountered"); + } + } + + // Properties are Entities that describe a relationship between Items (or other Entities) and Values of the property. + // See https://www.mediawiki.org/wiki/Wikibase/DataModel#Properties + public void processPropertyDocument​(PropertyDocument propertyDocument) { + propertyCount++; + if (propertyCount % 1000000 == 0) { + LOG.info(propertyCount + " properties encountered"); + } + } + } +} \ No newline at end of file