-
-
Notifications
You must be signed in to change notification settings - Fork 62
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #22 from dijs/parser
Added parser to library so we don't have the request dependency
- Loading branch information
Showing
5 changed files
with
280 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,225 @@ | ||
var utils = require('./utils'); | ||
|
||
/** | ||
* Main markup syndax parser | ||
* @param {string} data markup text | ||
* @param {Function} callback callback function | ||
* @return {undefined} return undefined when error occurs | ||
*/ | ||
module.exports = function(data, callback) { | ||
|
||
var content; | ||
|
||
/********************************** | ||
* Parse scraping result, * | ||
* which is in the format of JSON * | ||
**********************************/ | ||
|
||
try { | ||
content = JSON.parse(data); | ||
} catch(e) { | ||
callback(e); | ||
return; | ||
} | ||
|
||
if (!content.query) { callback(new Error('Query Not Found')); return; } | ||
|
||
/** | ||
* Get JSON data | ||
*/ | ||
var json = content.query.pages; | ||
var key = Object.keys(json); | ||
|
||
if (key.indexOf('-1') === 0) { | ||
callback(new Error('Page Index Not Found')); | ||
return; | ||
} else if(!json[key]){ | ||
callback(new Error('Malformed Response Payload')); | ||
return; | ||
} else if (json[key].revisions[0]['*'].indexOf('REDIRECT') > -1) { | ||
callback(new Error(json[key].revisions[0]['*'])); | ||
return; | ||
} | ||
|
||
/** | ||
* Get the JSON data that contains infobox section | ||
*/ | ||
var reg = new RegExp('{{[Ii]nfobox(.|\n)*}}', 'g'); | ||
var text = reg.exec(json[key].revisions[0]['*']); | ||
if (!text) { callback(new Error('Infobox Not Found')); return; } | ||
text = text[0]; | ||
|
||
|
||
/************************ | ||
* Remove useless marks * | ||
************************/ | ||
|
||
/* | ||
* Remove comments | ||
*/ | ||
text = utils.replaceAll('<!--.*-->', '', text); | ||
/* | ||
* Remove reference | ||
* TODO: support reference in advanced model | ||
*/ | ||
text = utils.replaceAll('<ref.*(/>|>.*</ref>)', '', text); | ||
/* | ||
* Remove all HTML tags like '<br>', etc. | ||
*/ | ||
text = utils.replaceAll('<[^>]+>', '', text); | ||
/* | ||
* Remove footnote | ||
* TODO; support footnote in advanced model | ||
*/ | ||
text = utils.replaceAll('\{\{refn[^\}\}]*?\}\}', '', text); | ||
|
||
/* | ||
* Merge order, bulleted, unbulleted, Pagelist | ||
* list items to one line | ||
*/ | ||
var lists = text.match(/\{\{(order|bulleted|unbulleted|Pagelist)(.*\n)*?\}\}/g); | ||
if (lists && lists.length) { | ||
lists.forEach(function(l) { | ||
text = text.replace(l, l.replace('{{', '').replace('}}', '') | ||
.replace(/(order|bulleted|unbulleted)\slist\n\|/g, '') | ||
.split('\n|').join(', ')); | ||
}); | ||
} | ||
|
||
/* | ||
* Parse URL | ||
*/ | ||
lists = text.match(/\{\{(URL)(.*)\}\}/g); | ||
if (lists && lists.length) { | ||
lists.forEach(function(l) { | ||
var tmp = l.replace('{{', '').replace('}}', '').split('|'); | ||
text = (tmp && tmp.length > 0) ? text.replace(l, tmp[tmp.length - 1]) : text; | ||
}); | ||
} | ||
|
||
/* | ||
* Parse Start date | ||
*/ | ||
lists = text.match(/\{\{(Start\sdate)(.*)\}\}/g); | ||
if (lists && lists.length) { | ||
lists.forEach(function(l) { | ||
var tmp = l.replace('{{', '').replace('}}', '').split('|'); | ||
/* Pop first element: 'Start date' */ | ||
tmp.shift(); | ||
text = (tmp) ? text.replace(l, tmp.join('/')) : text; | ||
}); | ||
} | ||
|
||
/***************************** | ||
* Analyze each line of text * | ||
*****************************/ | ||
|
||
var result = {}; | ||
text.split('\n|').forEach(function(item) { | ||
/** | ||
* Extract {item_name, item_content} from each item | ||
*/ | ||
var itemIndex = item.indexOf('='); | ||
if (itemIndex != -1) { | ||
var item_name = item.substr(0, itemIndex).trim(); | ||
var item_content = item.substr(itemIndex + 1).trim().split('\n')[0]; | ||
|
||
/* | ||
* Extract all simple texts inside '[[ ]]' | ||
* such as [[France]], [[Language French|French]], etc. | ||
*/ | ||
var find = item_content.match(/\[\[.*?\]\]/g); | ||
if (find) { | ||
find.forEach(function(substring) { | ||
var barestring = substring.replace('[[', '').replace(']]', ''); | ||
var arr = barestring.split('|'); | ||
/** | ||
* TODO: support link. | ||
* Reference: https://en.wikipedia.org/wiki/Help:Wiki_markup#Links_and_URLs | ||
*/ | ||
item_content = item_content.replace(substring, arr[arr.length - 1]); | ||
}); | ||
} | ||
|
||
/* | ||
* Remove font style | ||
* {{fake clarify}} | ||
* {{fake citation needed}} | ||
* {{fake elucidate}} | ||
* {{fake heading}} | ||
* {{fake notes and references}} | ||
* {{dummy ref}} | ||
* {{dummy backlink}} | ||
* {{dummy footnote}} | ||
* {{break}} | ||
* {{break|5}} | ||
* {{clear}} | ||
* {{clear|left}} | ||
* {{clear|right}} | ||
* {{plainlist}} | ||
* {{startflatlist}} | ||
* {{flatlist}} | ||
* {{hlist|first item|second item|third item|...}} | ||
* {{bulleted list |item1 |item2 |...}} | ||
* {{pagelist}} | ||
* {{nowrap}} | ||
* {{italics}} | ||
* {{smallcaps|small caps}} | ||
* {{pad|4.0em}} | ||
*/ | ||
while (item_content.indexOf('{{nowrap|') !== -1) { | ||
item_content = item_content.replace('{{nowrap|', ''); | ||
item_content = item_content.replace('}}', ''); | ||
} | ||
|
||
while (item_content.indexOf('{{small|') !== -1) { | ||
item_content = item_content.replace('{{small|', ''); | ||
item_content = item_content.replace('}}', ''); | ||
} | ||
|
||
if (item_content.indexOf('{{native') !== -1) { | ||
find = item_content.match(/\{\{native[^\}\}]*?\}\}/g); | ||
find && find.forEach(function(substring) { | ||
item_content = item_content.replace(substring, substring.split('|')[2]); | ||
}); | ||
} | ||
|
||
/* Remove simple vertical list tag */ | ||
if (item_content.indexOf('{{vunblist') !== -1 && | ||
item_content.split('{{').length < 3) { | ||
|
||
find = item_content.match(/\{\{vunblist[^\}\}]*?\}\}/g); | ||
find && find.forEach(function(substring) { | ||
var tmp = substring.split('|'); | ||
tmp.shift(); | ||
item_content = item_content.replace(substring, tmp.join(',').replace('}}', '')); | ||
}); | ||
} | ||
|
||
/* Remove horizon list tag */ | ||
if (item_content.indexOf('{{hlist') !== -1) { | ||
find = item_content.match(/\{\{hlist[^\}\}]*?\}\}/g); | ||
find && find.forEach(function(substring) { | ||
var tmp = substring.split('|'); | ||
tmp.shift(); | ||
item_content = item_content.replace(substring, tmp.join(',').replace('}}', '')); | ||
}); | ||
} | ||
|
||
/* Remove efn tag */ | ||
if (item_content.indexOf('{{efn') !== -1) { | ||
find = item_content.match(/\{\{efn[^\}\}]*?\}\}/g); | ||
find && find.forEach(function(substring) { | ||
item_content = item_content.replace(substring, ''); | ||
}); | ||
} | ||
|
||
item_content = utils.replaceAll(' ', ' ', item_content); | ||
item_content = utils.replaceAll('\n\}\}', '', item_content); | ||
result[item_name] = item_content; | ||
} | ||
}); | ||
|
||
callback(null, JSON.stringify(result)); | ||
return; | ||
}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
/**
 * Small collection of string helpers.
 * @type {Object}
 */
var utils = {
  /**
   * Replace every match of a pattern, then trim the result.
   *
   * `find` is compiled with `new RegExp(find, 'gm')`, so it is interpreted
   * as regular-expression source rather than as a literal snippet.
   *
   * @method replaceAll
   * @param  {string} find    regular-expression source to search for
   * @param  {string} replace replacement text
   * @param  {string} str     original string
   * @return {?string}        trimmed string with all matches replaced, or
   *                          null when `str` is falsy (including '')
   */
  replaceAll: function(find, replace, str) {
    if (!str) {
      return null;
    }
    var pattern = new RegExp(find, 'gm');
    var replaced = str.replace(pattern, replace);
    return replaced.trim();
  },

  /**
   * Check whether a text looks like valid JSON.
   *
   * Escape sequences are replaced by '@', string/literal/number tokens by
   * ']', and opening brackets that follow the start, a colon or a comma are
   * removed; the text passes when only `] , : { }` and whitespace remain.
   *
   * @method checkJson
   * @param  {string}  text JSON string
   * @return {boolean}      true when the text passes the check, false
   *                        otherwise (including for falsy input)
   */
  checkJson: function(text) {
    if (!text) {
      return false;
    }
    var skeleton = text
      .replace(/\\["\\\/bfnrtu]/g, '@')
      .replace(/"[^"\\\n\r]*"|true|false|null|-?\d+(?:\.\d*)?(?:[eE][+\-]?\d+)?/g, ']')
      .replace(/(?:^|:|,)(?:\s*\[)+/g, '');
    return /^[\],:{}\s]*$/.test(skeleton);
  }
};
|
||
module.exports = utils; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters