Skip to content

Commit

Permalink
Refactor bookmark import using a generic Netscape parser
Browse files Browse the repository at this point in the history
Relates to #607
Relates to #608
Relates to #493 (abandoned)

Additions:
- use Composer's autoload to load 3rd-party dependencies under vendor/

Modifications:
- [import] replace the current parser with a generic, stable parser
  - move code to application/NetscapeBookmarkUtils
  - improve status report after parsing
- [router] use the same endpoint for both bookmark upload and import dialog
- [template] update bookmark import options
  - allow adding tags to all imported links
  - allow selecting the visibility (privacy) of imported links
- [tests] ensure bookmarks are properly parsed and imported in the LinkDB
  - reuse reference input from the parser's test data

See:
- https://github.com/shaarli/netscape-bookmark-parser
- https://getcomposer.org/doc/01-basic-usage.md#autoloading

Signed-off-by: VirtualTam <virtualtam@flibidi.net>
  • Loading branch information
virtualtam authored and ArthurHoaro committed Nov 5, 2016
1 parent 05d8c48 commit d6d8558
Show file tree
Hide file tree
Showing 10 changed files with 779 additions and 122 deletions.
142 changes: 142 additions & 0 deletions application/NetscapeBookmarkUtils.php
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,146 @@ public static function filterAndFormat($linkDb, $selection, $prependNoteUrl, $in

return $bookmarkLinks;
}

/**
 * Builds the one-line report shown to the user after an import attempt
 *
 * When all three counters are zero the file is reported as having an
 * unknown format; otherwise the three counters are listed.
 *
 * @param string $filename       name of the file to import
 * @param int    $filesize       size of the file to import
 * @param int    $importCount    how many links were imported
 * @param int    $overwriteCount how many links were overwritten
 * @param int    $skipCount      how many links were skipped
 *
 * @return string Summary of the bookmark import status
 */
private static function importStatus(
    $filename,
    $filesize,
    $importCount=0,
    $overwriteCount=0,
    $skipCount=0
)
{
    // Common prefix shared by both outcomes
    $prefix = 'File '.$filename.' ('.$filesize.' bytes) ';

    // Nothing imported, overwritten or skipped: treated as an unknown format
    $somethingProcessed = $importCount != 0 || $overwriteCount != 0 || $skipCount != 0;
    if (! $somethingProcessed) {
        return $prefix.'has an unknown file format. Nothing was imported.';
    }

    return $prefix
        .'was successfully processed: '
        .$importCount.' links imported, '
        .$overwriteCount.' links overwritten, '
        .$skipCount.' links skipped.';
}

/**
 * Imports Web bookmarks from an uploaded Netscape bookmark dump
 *
 * The uploaded file is read, checked for the Netscape bookmark DOCTYPE,
 * parsed with NetscapeBookmarkParser, and every parsed bookmark is either
 * added to the LinkDB, used to overwrite an existing link with the same URL
 * (when $post['overwrite'] is set), or skipped. The LinkDB is saved once all
 * bookmarks have been processed.
 *
 * Recognized $post keys:
 * - 'overwrite':    non-empty to overwrite links whose URL already exists
 * - 'default_tags': whitespace/comma separated tags added to every link
 * - 'privacy':      'default' (or empty) to use the value from the file,
 *                   'private' or 'public' to force it for all links
 *
 * @param array  $post      Server $_POST parameters
 * @param array  $files     Server $_FILES parameters
 * @param LinkDB $linkDb    Loaded LinkDB instance
 * @param string $pagecache Page cache, passed to LinkDB::savedb()
 *
 * @return string Summary of the bookmark import status
 */
public static function import($post, $files, $linkDb, $pagecache)
{
    $filename = $files['filetoupload']['name'];
    $filesize = $files['filetoupload']['size'];
    $data = file_get_contents($files['filetoupload']['tmp_name']);

    // Sniff file type; an unreadable upload (file_get_contents() returned
    // false) is reported like an unrecognized format instead of being parsed
    if ($data === false
        || ! startsWith($data, '<!DOCTYPE NETSCAPE-Bookmark-file-1>')
    ) {
        return self::importStatus($filename, $filesize);
    }

    // Overwrite existing links?
    $overwrite = ! empty($post['overwrite']);

    // Add tags to all imported links?
    if (empty($post['default_tags'])) {
        $defaultTags = array();
    } else {
        $defaultTags = preg_split(
            '/[\s,]+/',
            escape($post['default_tags'])
        );
    }

    // links are imported as public by default
    $defaultPrivacy = 0;

    // The privacy option does not depend on the bookmark: read it once
    $privacy = empty($post['privacy']) ? 'default' : $post['privacy'];

    $parser = new NetscapeBookmarkParser(
        true,                       // nested tag support
        $defaultTags,               // additional user-specified tags
        strval(1 - $defaultPrivacy) // defaultPub = 1 - defaultPrivacy
    );
    $bookmarks = $parser->parseString($data);

    // A DateTime built from '@<timestamp>' is always in UTC, whatever the
    // default timezone is; link dates are converted to the default timezone
    // so they match what date()-based formatting would produce
    $timezone = new DateTimeZone(date_default_timezone_get());

    $importCount = 0;
    $overwriteCount = 0;
    $skipCount = 0;

    foreach ($bookmarks as $bkm) {
        if ($privacy == 'default') {
            // use value from the imported file
            $private = $bkm['pub'] == '1' ? 0 : 1;
        } else if ($privacy == 'private') {
            // all imported links are private
            $private = 1;
        } else if ($privacy == 'public') {
            // all imported links are public
            $private = 0;
        } else {
            // unrecognized option: fall back to the default
            $private = $defaultPrivacy;
        }

        $newLink = array(
            'title' => $bkm['title'],
            'url' => $bkm['uri'],
            'description' => $bkm['note'],
            'private' => $private,
            'linkdate'=> '',
            'tags' => $bkm['tags']
        );

        $existingLink = $linkDb->getLinkFromUrl($bkm['uri']);

        if ($existingLink !== false) {
            if ($overwrite === false) {
                // Do not overwrite an existing link
                $skipCount++;
                continue;
            }

            // Overwrite an existing link, keep its date
            // (an overwritten link counts as both imported and overwritten)
            $newLink['linkdate'] = $existingLink['linkdate'];
            $linkDb[$existingLink['linkdate']] = $newLink;
            $importCount++;
            $overwriteCount++;
            continue;
        }

        // Add a new link
        $newLinkDate = new DateTime('@'.strval($bkm['time']));
        $newLinkDate->setTimezone($timezone);
        while (!empty($linkDb[$newLinkDate->format(LinkDB::LINK_DATE_FORMAT)])) {
            // Ensure the date/time is not already used
            // - this hack is necessary as the date/time acts as a primary key
            // - apply 1 second increments until an unused index is found
            // See https://github.com/shaarli/Shaarli/issues/351
            $newLinkDate->add(new DateInterval('PT1S'));
        }
        $linkDbDate = $newLinkDate->format(LinkDB::LINK_DATE_FORMAT);
        $newLink['linkdate'] = $linkDbDate;
        $linkDb[$linkDbDate] = $newLink;
        $importCount++;
    }

    $linkDb->savedb($pagecache);
    return self::importStatus(
        $filename,
        $filesize,
        $importCount,
        $overwriteCount,
        $skipCount
    );
}
}
139 changes: 32 additions & 107 deletions index.php
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@
//error_reporting(-1);


// 3rd-party libraries
require_once 'inc/rain.tpl.class.php';
require_once __DIR__ . '/vendor/autoload.php';

// Shaarli library
require_once 'application/ApplicationUtils.php';
require_once 'application/Cache.php';
Expand All @@ -65,7 +69,6 @@
require_once 'application/PluginManager.php';
require_once 'application/Router.php';
require_once 'application/Updater.php';
require_once 'inc/rain.tpl.class.php';

// Ensure the PHP version is supported
try {
Expand Down Expand Up @@ -1468,26 +1471,37 @@ function renderPage($conf, $pluginManager)
exit;
}

// -------- User is uploading a file for import
if (isset($_SERVER['QUERY_STRING']) && startsWith($_SERVER['QUERY_STRING'], 'do=upload'))
{
// If file is too big, some form field may be missing.
if (!isset($_POST['token']) || (!isset($_FILES)) || (isset($_FILES['filetoupload']['size']) && $_FILES['filetoupload']['size']==0))
{
$returnurl = ( empty($_SERVER['HTTP_REFERER']) ? '?' : $_SERVER['HTTP_REFERER'] );
echo '<script>alert("The file you are trying to upload is probably bigger than what this webserver can accept ('.getMaxFileSize().' bytes). Please upload in smaller chunks.");document.location=\''.escape($returnurl).'\';</script>';
if ($targetPage == Router::$PAGE_IMPORT) {
// Upload a Netscape bookmark dump to import its contents

if (! isset($_POST['token']) || ! isset($_FILES['filetoupload'])) {
// Show import dialog
$PAGE->assign('maxfilesize', getMaxFileSize());
$PAGE->renderPage('import');
exit;
}
if (!tokenOk($_POST['token'])) die('Wrong token.');
importFile($LINKSDB);
exit;
}

// -------- Show upload/import dialog:
if ($targetPage == Router::$PAGE_IMPORT)
{
$PAGE->assign('maxfilesize',getMaxFileSize());
$PAGE->renderPage('import');
// Import bookmarks from an uploaded file
if (isset($_FILES['filetoupload']['size']) && $_FILES['filetoupload']['size'] == 0) {
// The file is too big or some form field may be missing.
echo '<script>alert("The file you are trying to upload is probably'
.' bigger than what this webserver can accept ('
.getMaxFileSize().' bytes).'
.' Please upload in smaller chunks.");document.location=\'?do='
.Router::$PAGE_IMPORT .'\';</script>';
exit;
}
if (! tokenOk($_POST['token'])) {
die('Wrong token.');
}
$status = NetscapeBookmarkUtils::import(
$_POST,
$_FILES,
$LINKSDB,
$conf->get('resource.page_cache')
);
echo '<script>alert("'.$status.'");document.location=\'?do='
.Router::$PAGE_IMPORT .'\';</script>';
exit;
}

Expand Down Expand Up @@ -1544,95 +1558,6 @@ function($a, $b) { return $a['order'] - $b['order']; }
exit;
}

/**
 * Process the import file form.
 *
 * Reads the uploaded file from $_FILES['filetoupload'], and if it starts with
 * the Netscape bookmark DOCTYPE, extracts every <DT><A ...> entry with string
 * splitting and regular expressions, stores the resulting links in $LINKSDB
 * and saves the database. The outcome is echoed as a <script> block that
 * shows an alert and then redirects to '?'.
 *
 * Form fields read from $_POST:
 * - 'private':   when non-empty, every imported link is marked private
 * - 'overwrite': when non-empty, links whose URL already exists are replaced
 *
 * Terminates the script with 'Not allowed.' when the user is not logged in.
 *
 * NOTE(review): $conf has no default value; verify that every caller passes it.
 *
 * @param LinkDB $LINKSDB Loaded LinkDB instance.
 * @param ConfigManager $conf Configuration Manager instance.
 */
function importFile($LINKSDB, $conf)
{
if (!isLoggedIn()) { die('Not allowed.'); }

$filename=$_FILES['filetoupload']['name'];
$filesize=$_FILES['filetoupload']['size'];
$data=file_get_contents($_FILES['filetoupload']['tmp_name']);
$private = (empty($_POST['private']) ? 0 : 1); // Should the links be imported as private?
$overwrite = !empty($_POST['overwrite']) ; // Should the imported links overwrite existing ones?
// Counts both newly added and overwritten links
$import_count=0;

// Sniff file type:
$type='unknown';
if (startsWith($data,'<!DOCTYPE NETSCAPE-Bookmark-file-1>')) $type='netscape'; // Netscape bookmark file (aka Firefox).

// Then import the bookmarks.
if ($type=='netscape')
{
// This is a standard Netscape-style bookmark file.
// This format is supported by all browsers (except IE, of course), also Delicious, Diigo and others.
// Each chunk following a <DT> marker is handled as one candidate bookmark.
foreach(explode('<DT>',$data) as $html) // explode is very fast
{
$link = array('linkdate'=>'','title'=>'','url'=>'','description'=>'','tags'=>'','private'=>0);
// $d[0] holds the <A> element, $d[1] (if any) the text following <DD>
$d = explode('<DD>',$html);
// Chunks that do not begin with an anchor are ignored
if (startsWith($d[0], '<A '))
{
$link['description'] = (isset($d[1]) ? html_entity_decode(trim($d[1]),ENT_QUOTES,'UTF-8') : ''); // Get description (optional)
preg_match('!<A .*?>(.*?)</A>!i',$d[0],$matches); $link['title'] = (isset($matches[1]) ? trim($matches[1]) : ''); // Get title
$link['title'] = html_entity_decode($link['title'],ENT_QUOTES,'UTF-8');
preg_match_all('! ([A-Z_]+)=\"(.*?)"!i',$html,$matches,PREG_SET_ORDER); // Get all other attributes
// Stays 0 when the entry carries no ADD_DATE attribute
$raw_add_date=0;
foreach($matches as $m)
{
$attr=$m[1]; $value=$m[2];
if ($attr=='HREF') $link['url']=html_entity_decode($value,ENT_QUOTES,'UTF-8');
elseif ($attr=='ADD_DATE')
{
$raw_add_date=intval($value);
if ($raw_add_date>30000000000) $raw_add_date/=1000; //If larger than year 2920, then was likely stored in milliseconds instead of seconds
}
// Any PRIVATE value other than '0' marks the link as private
elseif ($attr=='PRIVATE') $link['private']=($value=='0'?0:1);
// Commas in TAGS are replaced with spaces
elseif ($attr=='TAGS') $link['tags']=html_entity_decode(str_replace(',',' ',$value),ENT_QUOTES,'UTF-8');
}
// Entries without a URL are dropped
if ($link['url']!='')
{
// The form-level 'private' option overrides the per-link attribute
if ($private==1) $link['private']=1;
$dblink = $LINKSDB->getLinkFromUrl($link['url']); // See if the link is already in database.
if ($dblink==false)
{ // Link not in database, let's import it...
if (empty($raw_add_date)) $raw_add_date=time(); // Fall back to the current time when the entry has no ADD_DATE

// Make sure date/time is not already used by another link.
// (Some bookmark files have several different links with the same ADD_DATE)
// We increment date by 1 second until we find a date which is not used in DB.
// (so that links that have the same date/time are more or less kept grouped by date, but do not conflict.)
while (!empty($LINKSDB[date('Ymd_His',$raw_add_date)])) { $raw_add_date++; }// Linear probing, one second at a time
$link['linkdate']=date('Ymd_His',$raw_add_date);
$LINKSDB[$link['linkdate']] = $link;
$import_count++;
}
else // Link already present in database.
{
if ($overwrite)
{ // If overwrite is required, we import link data, except date/time.
$link['linkdate']=$dblink['linkdate'];
$LINKSDB[$link['linkdate']] = $link;
$import_count++;
}
}

}
}
}
// Persist once, after all entries have been processed
$LINKSDB->savedb($conf->get('resource.page_cache'));

// Report the result through a JavaScript alert, then redirect to '?'
echo '<script>alert("File '.json_encode($filename).' ('.$filesize.' bytes) was successfully processed: '.$import_count.' links imported.");document.location=\'?\';</script>';
}
else
{
// Not a Netscape bookmark file: nothing is written to the database
echo '<script>alert("File '.json_encode($filename).' ('.$filesize.' bytes) has an unknown file format. Nothing was imported.");document.location=\'?\';</script>';
}
}

/**
* Template for the list of links (<div id="linklist">)
* This function fills all the necessary fields in the $PAGE for the template 'linklist.html'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
require_once 'application/NetscapeBookmarkUtils.php';

/**
* Netscape bookmark import and export
* Netscape bookmark export
*/
class NetscapeBookmarkUtilsTest extends PHPUnit_Framework_TestCase
class BookmarkExportTest extends PHPUnit_Framework_TestCase
{
/**
* @var string datastore to test write operations
Expand Down
Loading

0 comments on commit d6d8558

Please sign in to comment.