Skip to content

Commit

Permalink
Merge pull request #22 from dijs/parser
Browse files Browse the repository at this point in the history
Added parser to library so we don't have the request dependency
  • Loading branch information
dijs committed May 26, 2016
2 parents 5cccc9d + 7b3ae17 commit ccbb4ad
Show file tree
Hide file tree
Showing 5 changed files with 280 additions and 6 deletions.
15 changes: 13 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ WikiJs is a node.js library which serves as an interface to Wikipedia (or any Me

- Search wiki articles
- Fetch article content
- Find all links/images/categories in a article page
- Find all links/images/categories in an article page
- Get parsed information about articles
- Find articles by geographical location
- and much more!
Expand Down Expand Up @@ -50,4 +50,15 @@ wiki.page('Batman').then(function(page) {
});
});

```
```

## Usage with webpack

In order for webpack to build wikijs properly, you must add an option to
your webpack configuration file. [Documentation](https://webpack.github.io/docs/configuration.html#externals)

```js
externals: {
"isomorphic-fetch": "fetch"
}
```
5 changes: 2 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"name": "wikijs",
"description": "Wikipedia interface for node",
"author": "Richard van der Dys",
"version": "0.2.0",
"version": "0.2.1",
"keywords": [
"wiki",
"wikipedia",
Expand All @@ -21,8 +21,7 @@
"dependencies": {
"babel-polyfill": "^6.9.0",
"isomorphic-fetch": "^2.2.1",
"underscore": "^1.8.3",
"wiki-infobox-parser": "^0.1.11"
"underscore": "^1.8.3"
},
"devDependencies": {
"babel-cli": "^6.9.0",
Expand Down
225 changes: 225 additions & 0 deletions src/parser/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
var utils = require('./utils');

/**
* Main markup syndax parser
* @param {string} data markup text
* @param {Function} callback callback function
* @return {undefined} return undefined when error occurs
*/
module.exports = function(data, callback) {

var content;

/**********************************
* Parse scraping result, *
* which is in the format of JSON *
**********************************/

try {
content = JSON.parse(data);
} catch(e) {
callback(e);
return;
}

if (!content.query) { callback(new Error('Query Not Found')); return; }

/**
* Get JSON data
*/
var json = content.query.pages;
var key = Object.keys(json);

if (key.indexOf('-1') === 0) {
callback(new Error('Page Index Not Found'));
return;
} else if(!json[key]){
callback(new Error('Malformed Response Payload'));
return;
} else if (json[key].revisions[0]['*'].indexOf('REDIRECT') > -1) {
callback(new Error(json[key].revisions[0]['*']));
return;
}

/**
* Get the JSON data that contains infobox section
*/
var reg = new RegExp('{{[Ii]nfobox(.|\n)*}}', 'g');
var text = reg.exec(json[key].revisions[0]['*']);
if (!text) { callback(new Error('Infobox Not Found')); return; }
text = text[0];


/************************
* Remove useless marks *
************************/

/*
* Remove comments
*/
text = utils.replaceAll('<!--.*-->', '', text);
/*
* Remove reference
* TODO: support reference in advanced model
*/
text = utils.replaceAll('<ref.*(/>|>.*</ref>)', '', text);
/*
* Remove all HTML tags like '<br>', etc.
*/
text = utils.replaceAll('<[^>]+>', '', text);
/*
* Remove footnote
* TODO; support footnote in advanced model
*/
text = utils.replaceAll('\{\{refn[^\}\}]*?\}\}', '', text);

/*
* Merge order, bulleted, unbulleted, Pagelist
* list items to one line
*/
var lists = text.match(/\{\{(order|bulleted|unbulleted|Pagelist)(.*\n)*?\}\}/g);
if (lists && lists.length) {
lists.forEach(function(l) {
text = text.replace(l, l.replace('{{', '').replace('}}', '')
.replace(/(order|bulleted|unbulleted)\slist\n\|/g, '')
.split('\n|').join(', '));
});
}

/*
* Parse URL
*/
lists = text.match(/\{\{(URL)(.*)\}\}/g);
if (lists && lists.length) {
lists.forEach(function(l) {
var tmp = l.replace('{{', '').replace('}}', '').split('|');
text = (tmp && tmp.length > 0) ? text.replace(l, tmp[tmp.length - 1]) : text;
});
}

/*
* Parse Start date
*/
lists = text.match(/\{\{(Start\sdate)(.*)\}\}/g);
if (lists && lists.length) {
lists.forEach(function(l) {
var tmp = l.replace('{{', '').replace('}}', '').split('|');
/* Pop first element: 'Start date' */
tmp.shift();
text = (tmp) ? text.replace(l, tmp.join('/')) : text;
});
}

/*****************************
* Analyze each line of text *
*****************************/

var result = {};
text.split('\n|').forEach(function(item) {
/**
* Extract {item_name, item_content} from each item
*/
var itemIndex = item.indexOf('=');
if (itemIndex != -1) {
var item_name = item.substr(0, itemIndex).trim();
var item_content = item.substr(itemIndex + 1).trim().split('\n')[0];

/*
* Extract all simple texts inside '[[ ]]'
* such as [[France]], [[Language French|French]], etc.
*/
var find = item_content.match(/\[\[.*?\]\]/g);
if (find) {
find.forEach(function(substring) {
var barestring = substring.replace('[[', '').replace(']]', '');
var arr = barestring.split('|');
/**
* TODO: support link.
* Reference: https://en.wikipedia.org/wiki/Help:Wiki_markup#Links_and_URLs
*/
item_content = item_content.replace(substring, arr[arr.length - 1]);
});
}

/*
* Remove font style
* {{fake clarify}}
* {{fake citation needed}}
* {{fake elucidate}}
* {{fake heading}}
* {{fake notes and references}}
* {{dummy ref}}
* {{dummy backlink}}
* {{dummy footnote}}
* {{break}}
* {{break|5}}
* {{clear}}
* {{clear|left}}
* {{clear|right}}
* {{plainlist}}
* {{startflatlist}}
* {{flatlist}}
* {{hlist|first item|second item|third item|...}}
* {{bulleted list |item1 |item2 |...}}
* {{pagelist}}
* {{nowrap}}
* {{italics}}
* {{smallcaps|small caps}}
* {{pad|4.0em}}
*/
while (item_content.indexOf('{{nowrap|') !== -1) {
item_content = item_content.replace('{{nowrap|', '');
item_content = item_content.replace('}}', '');
}

while (item_content.indexOf('{{small|') !== -1) {
item_content = item_content.replace('{{small|', '');
item_content = item_content.replace('}}', '');
}

if (item_content.indexOf('{{native') !== -1) {
find = item_content.match(/\{\{native[^\}\}]*?\}\}/g);
find && find.forEach(function(substring) {
item_content = item_content.replace(substring, substring.split('|')[2]);
});
}

/* Remove simple vertical list tag */
if (item_content.indexOf('{{vunblist') !== -1 &&
item_content.split('{{').length < 3) {

find = item_content.match(/\{\{vunblist[^\}\}]*?\}\}/g);
find && find.forEach(function(substring) {
var tmp = substring.split('|');
tmp.shift();
item_content = item_content.replace(substring, tmp.join(',').replace('}}', ''));
});
}

/* Remove horizon list tag */
if (item_content.indexOf('{{hlist') !== -1) {
find = item_content.match(/\{\{hlist[^\}\}]*?\}\}/g);
find && find.forEach(function(substring) {
var tmp = substring.split('|');
tmp.shift();
item_content = item_content.replace(substring, tmp.join(',').replace('}}', ''));
});
}

/* Remove efn tag */
if (item_content.indexOf('{{efn') !== -1) {
find = item_content.match(/\{\{efn[^\}\}]*?\}\}/g);
find && find.forEach(function(substring) {
item_content = item_content.replace(substring, '');
});
}

item_content = utils.replaceAll('&nbsp', ' ', item_content);
item_content = utils.replaceAll('\n\}\}', '', item_content);
result[item_name] = item_content;
}
});

callback(null, JSON.stringify(result));
return;
};
39 changes: 39 additions & 0 deletions src/parser/utils.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
/**
* Collection of utils
* @type {Object}
*/
var utils = {
  /**
   * Replace every match of a pattern and trim the outcome.
   * @method function
   * @param  {string} find     source of the regular expression to look for
   *                           (compiled with the 'g' and 'm' flags)
   * @param  {string} replace  replacement text
   * @param  {string} str      string to work on
   * @return {string}          trimmed string after replacement, or null when
   *                           `str` is falsy (empty string included)
   */
  replaceAll: function(find, replace, str) {
    if (!str) {
      return null;
    }
    var pattern = new RegExp(find, 'gm');
    var replaced = str.replace(pattern, replace);
    return replaced.trim();
  },

  /**
   * Check whether a text looks like valid JSON.
   * Escapes, literals and opening brackets are collapsed step by step; the
   * text passes when only structural characters and whitespace remain.
   * @method function
   * @param  {string} text     JSON string
   * @return {boolean}         true when the text passes the check, false
   *                           otherwise (falsy input included)
   */
  checkJson: function(text) {
    if (!text) {
      return false;
    }
    var withoutEscapes = text.replace(/\\["\\\/bfnrtu]/g, '@');
    var withoutLiterals = withoutEscapes.replace(/"[^"\\\n\r]*"|true|false|null|-?\d+(?:\.\d*)?(?:[eE][+\-]?\d+)?/g, ']');
    var skeleton = withoutLiterals.replace(/(?:^|:|,)(?:\s*\[)+/g, '');
    return /^[\],:{}\s]*$/.test(skeleton);
  }
};

module.exports = utils;
2 changes: 1 addition & 1 deletion src/wiki.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import 'babel-polyfill';
import fetch from 'isomorphic-fetch';
import _ from 'underscore';
import wikiInfoboxParser from 'wiki-infobox-parser/lib/parser';
import wikiInfoboxParser from './parser';
import querystring from 'querystring';

/**
Expand Down

0 comments on commit ccbb4ad

Please sign in to comment.