Skip to content

Commit

Permalink
Some progress on wiki extraction (#8) : get wikitext from page, parse…
Browse files Browse the repository at this point in the history
… sections, parse table, parse infobox
  • Loading branch information
rom1504 committed Mar 28, 2015
1 parent c913d17 commit cec349b
Show file tree
Hide file tree
Showing 5 changed files with 246 additions and 1 deletion.
10 changes: 10 additions & 0 deletions bin/wiki_extractor/block_parser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// Script: fetch the "Stone" article, split it into sections and print the
// info box parsed from the lines that precede the first heading.
var WikiTextParser = require('./wikitext_parser');

var parser = new WikiTextParser();

// Receives the raw wikitext handed over by getArticle.
function printInfoBox(wikitext) {
  var sections = parser.pageToSectionObject(wikitext);
  console.log(parser.parseInfoBox(sections["content"]));
}

parser.getArticle("Stone", printInfoBox);
12 changes: 12 additions & 0 deletions bin/wiki_extractor/blocks_parser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// Script: fetch the "Blocks" article and print, for every template row of the
// "Naturally generated" Overworld table, the third "|"-separated field.
var WikiTextParser = require('./wikitext_parser');

var parser = new WikiTextParser();

// Picks the field at index 2 of a parsed table row.
function thirdField(row) {
  return row[2];
}

parser.getArticle("Blocks", function (wikitext) {
  var sections = parser.pageToSectionObject(wikitext);
  var worldGenerated = sections["World-generated blocks"];
  var tableLines = worldGenerated["The Overworld"]["Naturally generated"]["content"];
  var links = parser.parseTable(tableLines).map(thirdField);
  console.log(links);
});
108 changes: 108 additions & 0 deletions bin/wiki_extractor/test_wikitest_parser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
// Ad-hoc checks for WikiTextParser.pageToSectionObject: each fixture below is
// a wikitext string paired with the section object it is expected to produce.
var WikiTextParser = require('./wikitext_parser');
// underscore supplies the deep comparison (_.isEqual) used by test().
var _ = require('underscore');


// Parser instance shared by every check in this file.
var wikiTextParser = new WikiTextParser();

// Fixture 1: two depth-1 headings, the first one holding two depth-2 headings.
var testInput = [
  "text_abstract\n",
  "= premiere =\n",
  "blabla\n",
  "== deuxieme ==\n",
  "text\n",
  "== troisieme ==\n",
  "text2\n",
  "= quatrieme =\n",
  "text3\n"
].join("");

// Expected tree for fixture 1. The trailing newline of the input shows up as
// an empty last line in the content of the final section.
var testOutput = (function () {
  var premiere = {
    "content": ["blabla"],
    "deuxieme": {"content": ["text"]},
    "troisieme": {"content": ["text2"]}
  };
  var quatrieme = {"content": ["text3", ""]};
  return {
    "content": ["text_abstract"],
    "premiere": premiere,
    "quatrieme": quatrieme
  };
})();

// Fixture 2: same layout as fixture 1, but the nested headings jump straight
// from depth 1 to depth 3; they are still expected to nest under "premiere".
var testInput2 = [
  "text_abstract\n",
  "= premiere =\n",
  "blabla\n",
  "=== deuxieme ===\n",
  "text\n",
  "=== troisieme ===\n",
  "text2\n",
  "= quatrieme =\n",
  "text3\n"
].join("");

// Expected tree for fixture 2.
var testOutput2 = (function () {
  var premiere = {
    "content": ["blabla"],
    "deuxieme": {"content": ["text"]},
    "troisieme": {"content": ["text2"]}
  };
  var quatrieme = {"content": ["text3", ""]};
  return {
    "content": ["text_abstract"],
    "premiere": premiere,
    "quatrieme": quatrieme
  };
})();

// Fixture 3: three sibling depth-3 headings under one depth-1 heading, one of
// them with a purely numeric title.
var testInput3 = [
  "text_abstract\n",
  "= premiere =\n",
  "blabla\n",
  "=== deuxieme ===\n",
  "text\n",
  "=== troisieme ===\n",
  "text7\n",
  "=== 6666 ===\n",
  "text2\n",
  "= quatrieme =\n",
  "text3\n"
].join("");

// Expected tree for fixture 3.
var testOutput3 = (function () {
  var premiere = {
    "content": ["blabla"],
    "deuxieme": {"content": ["text"]},
    "troisieme": {"content": ["text7"]},
    "6666": {"content": ["text2"]}
  };
  var quatrieme = {"content": ["text3", ""]};
  return {
    "content": ["text_abstract"],
    "premiere": premiere,
    "quatrieme": quatrieme
  };
})();

/**
 * Runs pageToSectionObject on `input` and deep-compares the result with
 * `expectedOutput` using _.isEqual.
 *
 * On a mismatch both objects are printed as JSON so they can be diffed.
 *
 * @param {string} input wikitext handed to the parser
 * @param {Object} expectedOutput section object the parser should produce
 * @returns {boolean} true when the objects are equal, false otherwise
 */
function test(input, expectedOutput) {
  var actualOutput = wikiTextParser.pageToSectionObject(input);
  var matches = _.isEqual(actualOutput, expectedOutput);
  if (!matches) {
    console.log("actual output :");
    console.log(JSON.stringify(actualOutput));
    console.log("expected output :");
    console.log(JSON.stringify(expectedOutput));
    return false;
  }
  return true;
}

// Run every fixture. All three calls are evaluated up front (no
// short-circuit) so that each failing fixture gets its diagnostics printed.
var results = [
  test(testInput, testOutput),
  test(testInput2, testOutput2),
  test(testInput3, testOutput3)
];

// Report failure through the exit code so that a shell or CI job notices it;
// without this the script exits with 0 even when a fixture does not match.
if (results.indexOf(false) !== -1) {
  process.exitCode = 1;
}
113 changes: 113 additions & 0 deletions bin/wiki_extractor/wikitext_parser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
// nodemw provides the MediaWiki client constructor used by WikiTextParser.
var bot = require('nodemw');
// NOTE(review): `_` does not appear to be referenced in this file — confirm before removing.
var _ = require('underscore');

// The module exports the constructor itself. WikiTextParser is a function
// declaration, so it is hoisted and already defined at this point.
module.exports = WikiTextParser;

/**
 * Creates a parser whose `client` property is a nodemw client configured for
 * the server minecraft.gamepedia.com (path "/", debug output disabled).
 *
 * @constructor
 */
function WikiTextParser()
{
  var clientOptions = {
    server: 'minecraft.gamepedia.com',
    path: '/',
    debug: false
  };
  this.client = new bot(clientOptions);
}



/**
 * Fetches an article through the wiki client and passes what the client
 * returns for it to `cb`.
 *
 * When the client reports an error, the error is written to stderr and `cb`
 * is NOT invoked.
 *
 * @param {string} title article title
 * @param {function(*)} cb called with the article data on success
 */
WikiTextParser.prototype.getArticle=function(title,cb)
{
  function onArticle(err, data) {
    if (!err) {
      cb(data);
      return;
    }
    console.error(err);
  }

  this.client.getArticle(title, onArticle);
};


/**
 * Derives a section title from a heading line by removing every "=", "[" and
 * "]" character (wherever it occurs) and trimming surrounding whitespace.
 *
 * @param {string} titleLine heading line, e.g. "== [[Foo]] =="
 * @returns {string} the bare title, e.g. "Foo"
 */
function extractTitle(titleLine) {
  var withoutMarkup = titleLine.replace(/=|\[|\]/g, "");
  return withoutMarkup.trim();
}

/**
 * Counts the "=" characters at the start of a line, i.e. the heading depth.
 *
 * Always returns a number: 0 for an empty line or a line that does not start
 * with "=", and the full length for a line made only of "=" characters
 * (previously those cases fell off the end of the loop and yielded undefined).
 *
 * @param {string} titleLine line to inspect
 * @returns {number} number of leading "=" characters
 */
function extractDepth(titleLine) {
  var depth = 0;
  while (depth < titleLine.length && titleLine.charAt(depth) === '=') {
    depth++;
  }
  return depth;
}

/**
 * Finds the closest depth strictly below `currentDepth` that is present as a
 * key in `sections`.
 *
 * @param {Object} sections object keyed by depth
 * @param {number} currentDepth depth to search below
 * @returns {number} the closest shallower depth found, or -1 when there is none
 */
function findFirstDefinedDepth(sections,currentDepth)
{
  var depth = currentDepth;
  while (--depth >= 0) {
    if (depth in sections) {
      return depth;
    }
  }
  return -1;
}


// Fallback for runtimes without String.prototype.startsWith. It is installed
// only when no such function exists and compares the leading characters of
// the string with the given prefix.
if (typeof String.prototype.startsWith !== 'function') {
  String.prototype.startsWith = function (prefix) {
    var head = this.slice(0, prefix.length);
    return head == prefix;
  };
}
/**
 * Converts wikitext into a nested object that mirrors its heading hierarchy.
 *
 * Every line starting with "=" for which extractDepth reports a positive
 * depth opens a section: an object stored under its title (extractTitle)
 * inside the closest open section of a smaller depth. All other lines are
 * collected and stored as an array under the "content" key of the section
 * they follow; lines before the first heading go to the root object.
 *
 * "content" is only written when at least one line was collected before the
 * next heading, except for the last section of the page, which always gets a
 * "content" array (possibly empty).
 *
 * @param {string} data wikitext of a page, lines separated by "\n"
 * @returns {Object} the root section object
 */
WikiTextParser.prototype.pageToSectionObject = function(data) {
  var root = {};
  // Innermost section currently open at each heading depth; the root is depth 0.
  var openSections = {0: root};
  var currentDepth = 0;
  var pendingLines = [];

  data.split("\n").forEach(function (line) {
    var depth = line.startsWith("=") ? extractDepth(line) : 0;
    // Lines without a usable positive depth are ordinary content. This also
    // keeps a depth-less "=" line from crashing the parent lookup below.
    if (!(depth > 0)) {
      pendingLines.push(line);
      return;
    }

    if (pendingLines.length !== 0) {
      openSections[currentDepth]["content"] = pendingLines;
      pendingLines = [];
    }

    var parentDepth = findFirstDefinedDepth(openSections, depth);
    var section = {};
    openSections[parentDepth][extractTitle(line)] = section;

    // A new heading closes every deeper section. Without this, a later deep
    // heading could be attached to a sub-section of an earlier, already
    // finished branch.
    Object.keys(openSections).forEach(function (key) {
      if (Number(key) > depth) {
        delete openSections[key];
      }
    });

    openSections[depth] = section;
    currentDepth = depth;
  });

  openSections[currentDepth]["content"] = pendingLines;
  return root;
};


/**
 * Extracts the template rows from the lines of a section.
 *
 * A line is a row when it starts with "{{" and is not exactly "{{-}}". Each
 * row has all "{" and "}" characters removed and is then split on "|".
 *
 * @param {string[]} sectionLineArray lines of a section
 * @returns {string[][]} one array of fields per row, in input order
 */
WikiTextParser.prototype.parseTable = function(sectionLineArray)
{
  function isRow(line) {
    return line.startsWith("{{") && line != "{{-}}";
  }

  function toFields(line) {
    var withoutBraces = line.replace(/{|}/g, "");
    return withoutBraces.split("|");
  }

  return sectionLineArray.filter(isRow).map(toFields);
};

/**
 * Parses the info box found in the lines of a section.
 *
 * - A line starting with "{{" sets `template` to that line with all "{" and
 *   "}" characters removed; when several lines match, the last one wins.
 * - A line starting with "|" is a parameter. The key is the text before the
 *   first "=" with "|" characters removed. The value is everything after that
 *   first "=", kept verbatim, so values containing "=" or "|" (nested
 *   templates, links) are no longer truncated or mangled. A parameter line
 *   without "=" is stored with the value undefined.
 *
 * @param {string[]} sectionLineArray lines of a section
 * @returns {{template: (string|undefined), values: Object}} the parsed info box
 */
WikiTextParser.prototype.parseInfoBox = function(sectionLineArray)
{
  var infoBox = {};
  infoBox["values"] = {};

  sectionLineArray.forEach(function (line) {
    if (line.startsWith("{{")) {
      infoBox["template"] = line.replace(/{|}/g, "");
    }
    if (!line.startsWith("|")) {
      return;
    }

    // Split on the first "=" only; later "=" characters belong to the value.
    var separator = line.indexOf("=");
    var rawKey = separator === -1 ? line : line.slice(0, separator);
    var value = separator === -1 ? undefined : line.slice(separator + 1);
    infoBox["values"][rawKey.replace(/\|/g, "")] = value;
  });

  return infoBox;
};
4 changes: 3 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
"license": "MIT",
"devDependencies": {
"jsonschema": "~1.0.1",
"mocha": "~2.2.1"
"mocha": "~2.2.1",
"nodemw": "~0.4.2",
"underscore" : "~1.8.2"
}
}

0 comments on commit cec349b

Please sign in to comment.