-
-
Notifications
You must be signed in to change notification settings - Fork 31
/
zimArchive.js
1005 lines (970 loc) · 48.2 KB
/
zimArchive.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/**
* zimArchive.js: Support for archives in ZIM format.
*
* Copyright 2015-2023 Mossroy, Jaifroid and contributors
* Licence GPL v3:
*
* This file is part of Kiwix.
*
* Kiwix is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public Licence as published by
* the Free Software Foundation, either version 3 of the Licence, or
* (at your option) any later version.
*
* Kiwix is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public Licence for more details.
*
* You should have received a copy of the GNU General Public Licence
* along with Kiwix (file LICENSE-GPLv3.txt). If not, see <http://www.gnu.org/licenses/>
*/
'use strict';
/* global params, appstate */
import zimfile from './zimfile.js';
import zimDirEntry from './zimDirEntry.js';
import transformZimit from './transformZimit.js';
import util from './util.js';
import uiUtil from './uiUtil.js';
import utf8 from './utf8.js';
/**
* ZIM Archive
*
* @typedef ZIMArchive
* @property {ZIMFile} file The ZIM file (instance of ZIMFile, that might physically be split into several actual _files)
* @property {String} counter Counter of various types of content in the archive
* @property {String} creator Creator of the content
* @property {String} date Date of the creation of the archive
* @property {String} description Description of the content
* @property {String} language Language of the content
* @property {String} name Name of the archive
* @property {String} publisher Publisher of the content
* @property {String} title Title of the content
* @property {String} zimType Extended property: currently either 'open' for OpenZIM file type, or 'zimit' for the warc2zim file type used by Zimit
*/
/**
* @callback callbackZIMArchive
* @param {ZIMArchive} zimArchive Ready-to-use ZIMArchive
*/
/**
* @callback callbackMetadata
* @param {String} data metadata string
*/
/**
* @param {Worker} LZ A Web Worker to run the libzim Web Assembly binary
*/
var LZ;
/**
* Creates a ZIM archive object to access the ZIM file at the given path in the given storage.
* This constructor can also be used with a single File parameter.
*
* @param {StorageFirefoxOS|Array<Blob>} storage Storage (in this case, the path must be given) or Array of Files (path parameter must be omitted)
* @param {String} path The Storage path for an OS that requires this to be specified
* @param {callbackZIMArchive} callbackReady The function to call when the archive is ready to use
* @param {callbackZIMArchive} callbackError The function to call when an error occurs
*/
function ZIMArchive (storage, path, callbackReady, callbackError) {
var that = this;
that.file = null;
var whenZimReady = function () {
// Add time-critical metadata from the M/ namespace that you need early access to here
// Note that adding metadata here delays the reporting of the ZIM archive as ready
// Further metadata are added in the background below, and can be accessed later
return Promise.all([
that.addMetadataToZIMFile('Creator'),
that.addMetadataToZIMFile('Language'),
that.addMetadataToZIMFile('Publisher')
]).then(function () {
console.debug('ZIMArchive ready, metadata will be added in the background');
// Add non-time-critical metadata to archive in background so as not to delay opening of the archive
// DEV: Note that it does not make sense to extract illustration (icon) metadata here. Instead, if you implement use of the illustration
// metadata as icons for the loaded ZIM [kiwix-js #886], you should simply use the ZIMArdhive.getMetadata() function when needed
setTimeout(function () {
Promise.all([
that.addMetadataToZIMFile('Counter'),
that.addMetadataToZIMFile('Date'),
that.addMetadataToZIMFile('Description'),
that.addMetadataToZIMFile('Name'),
that.addMetadataToZIMFile('Source'),
that.addMetadataToZIMFile('Title')
]).then(function () {
console.debug('ZIMArchive metadata loaded:', that);
});
}, 1000); // DEV: If you need any of the above earlier, you can alter this delay
// We need to get the landing page of any Zimit archive opened
// Note that the test below catches both zimit and zimit2 types
if (/zimit/.test(that.zimType)) {
return that.setZimitMetadata().then(function () {
callbackReady(that);
});
} else {
// All listings should be loaded, so we can now call the callback
callbackReady(that);
}
});
};
var createZimfile = function (fileArray) {
return zimfile.fromFileArray(fileArray).then(function (file) {
that.file = file;
// Clear the previous libzimWoker
LZ = null;
// Set a global parameter to report the search provider type
params.searchProvider = 'title';
// File has been created, but we need to add any Listings which extend the archive metadata
return that.file.setListings([
// Provide here any Listings for which we need to extract metadata as key:value obects to be added to the file
// 'ptrName' and 'countName' contain the key names to be set in the archive file object
{
// This defines the standard v0 (legacy) title index that contains listings for every entry in the ZIM (not just articles)
// It represents the same index that is referenced in the ZIM archive header
path: 'X/listing/titleOrdered/v0',
ptrName: 'titlePtrPos',
countName: 'entryCount'
},
{
// This defines a new version 1 index that is present in no-namespace ZIMs, and contains a title-ordered list of articles
path: 'X/listing/titleOrdered/v1',
ptrName: 'articlePtrPos',
countName: 'articleCount'
},
{
// This tests for and specifies the existence of any Xapian Full Text Index
path: 'X/fulltext/xapian',
ptrName: 'fullTextIndex',
countName: 'fullTextIndexSize'
}
]).then(function () {
that.libzimReady = null;
// There is currently an exception thrown in the libzim wasm if we attempt to load a split ZIM archive, so we work around
var isSplitZim = /\.zima.$/i.test(that.file._files[0].name);
var libzimReaderType = params.debugLibzimASM || ('WebAssembly' in self ? 'wasm' : 'asm');
if ((that.file.fullTextIndex || params.useLibzim) && params.debugLibzimASM !== 'disable' && (params.debugLibzimASM || !isSplitZim &&
// The ASM implementation requires Atomics support, whereas the WASM implementation does not
(typeof Atomics !== 'undefined' || libzimReaderType === 'wasm') &&
// Note that NWJS currently throws due to problems with Web Worker context
!(window.nw && that.file._files[0].readMode === 'electron'))) {
that.libzimReady = 'loading';
console.log('Instantiating libzim ' + libzimReaderType + ' Web Worker...');
if (params.useLibzim) uiUtil.pollSpinner('Waiting for libzim...', true);
LZ = new Worker('js/lib/libzim-' + libzimReaderType + '.js');
that.callLibzimWorker({ action: 'init', files: that.file._files }).then(function () {
that.libzimReady = 'ready';
// If user is using libzim for reading the file, we have delayed the callback till now
if (params.useLibzim) whenZimReady();
params.searchProvider = 'fulltext: ' + libzimReaderType;
// Update the API panel
uiUtil.reportSearchProviderToAPIStatusPanel(params.searchProvider);
}).catch(function (err) {
uiUtil.reportSearchProviderToAPIStatusPanel(params.searchProvider + ': ERROR');
console.error('The libzim worker could not be instantiated!', err);
that.libzimReady = 'error';
});
} else {
// var message = 'Full text searching is not available because ';
if (!that.file.fullTextIndex) {
params.searchProvider += ': no_fulltext'; // message += 'this ZIM does not have a full-text index.';
} else if (isSplitZim) {
params.searchProvider += ': split_zim'; // message += 'the ZIM archive is split.';
} else if (typeof Atomics === 'undefined') {
params.searchProvider += ': no_atomics'; // message += 'this browser does not support Atomic operations.';
} else if (/Android/.test(params.appType)) {
params.searchProvider += ': no_sharedArrayBuffer';
} else if (params.debugLibzimASM === 'disable') {
params.searchProvider += ': disabled';
} else {
params.searchProvider += ': unknown';
}
uiUtil.reportSearchProviderToAPIStatusPanel(params.searchProvider);
}
// Set the archive file type ('open', 'zimit' or 'zimit2')
return that.addMetadataToZIMFile('Scraper').then(function () {
params.zimType = that.setZimType();
// If user is not using libzim for reading the file, we can call the ready callback now
if (!params.useLibzim) whenZimReady();
});
}).catch(function (err) {
console.warn('Error setting archive listings: ', err);
});
});
};
if (storage && !path) {
var fileList = storage;
// We need to convert the FileList into an Array
var fileArray = [].slice.call(fileList);
// The constructor has been called with an array of File/Blob parameter
createZimfile(fileArray);
} else {
if (/.*zim..$/.test(path)) {
// split archive
that._searchArchiveParts(storage, path.slice(0, -2)).then(function (fileArray) {
createZimfile(fileArray);
}).catch(function (error) {
callbackError('Error reading files in split archive ' + path + ': ' + error, 'Error reading archive files');
});
} else {
storage.get(path).then(function (file) {
createZimfile([file]);
}).catch(function (error) {
callbackError('Error reading ZIM file ' + path + ' : ' + error, 'Error reading archive file');
});
}
}
}
/**
* Searches the directory for all parts of a split archive.
* @param {Storage} storage storage interface
* @param {String} prefixPath path to the split files, missing the "aa" / "ab" / ... suffix.
* @returns {Promise} that resolves to the array of file objects found.
*/
ZIMArchive.prototype._searchArchiveParts = function (storage, prefixPath) {
var fileArray = [];
var nextFile = function (part) {
var suffix = String.fromCharCode(0x61 + Math.floor(part / 26)) + String.fromCharCode(0x61 + part % 26);
return storage.get(prefixPath + suffix)
.then(function (file) {
fileArray.push(file);
return nextFile(part + 1);
}, function (error) {
console.error('Error reading split archive file ' + prefixPath + suffix + ': ', error);
return fileArray;
});
};
return nextFile(0);
};
/**
*
* @returns {Boolean}
*/
ZIMArchive.prototype.isReady = function () {
return this.file !== null;
};
/**
* Detects whether the supplied archive is a Zimit-style archive or an OpenZIM archive and
* sets a zimType property accordingly; also returns the detected type. Extends ZIMArchive.
* @returns {String} 'zimit' for a classic Zimit archive, 'zimit2' for a zimit2 archive, or 'open' for an OpenZIM archive
*/
ZIMArchive.prototype.setZimType = function () {
var archiveType = null;
if (this.isReady()) {
archiveType = 'open';
this.file.mimeTypes.forEach(function (v) {
if (/warc-headers/i.test(v)) archiveType = 'zimit';
});
if (archiveType !== 'zimit' && this.scraper) {
// Check if it's a zimit2 type archive by seeing if the scraper contains 'warc2zim'
archiveType = /warc2zim|zimit/i.test(this.scraper) ? 'zimit2' : archiveType;
}
this.zimType = archiveType;
console.debug('Archive type set to: ' + archiveType);
} else {
console.error('ZIMArchive is not ready! Cannot set ZIM type.');
}
return archiveType;
};
/**
* Looks for the DirEntry of the main page
* @param {callbackDirEntry} callback
* @returns {Promise} that resolves to the DirEntry
*/
ZIMArchive.prototype.getMainPageDirEntry = function (callback) {
if (this.isReady()) {
var mainPageUrlIndex = this.file.mainPage;
var that = this;
this.file.dirEntryByUrlIndex(mainPageUrlIndex).then(function (dirEntry) {
// Filter out Zimit files that we cannot handle without error
if (that.zimType === 'zimit' && !appstate.isReplayWorkerAvailable) dirEntry = transformZimit.filterReplayFiles(dirEntry);
callback(dirEntry);
});
}
};
/**
*
* @param {String} dirEntryId
* @returns {DirEntry}
*/
ZIMArchive.prototype.parseDirEntryId = function (dirEntryId) {
return zimDirEntry.DirEntry.fromStringId(this.file, dirEntryId);
};
/**
* @callback callbackDirEntryList
* @param {Array.<DirEntry>} dirEntryArray Array of DirEntries found
*/
/**
* Look for DirEntries with title starting with the prefix of the current search object.
* For now, ZIM titles are case sensitive.
* So, as workaround, we try several variants of the prefix to find more results.
* This should be enhanced when the ZIM format will be modified to store normalized titles
* See https://phabricator.wikimedia.org/T108536
*
* @param {Object} search The current appstate.search object
* @param {callbackDirEntryList} callback The function to call with the result
* @param {Boolean} noInterim A flag to prevent callback until all results are ready (used in testing)
*/
ZIMArchive.prototype.findDirEntriesWithPrefix = function (search, callback, noInterim) {
var that = this;
// Establish array of initial values that must be searched first. All of these patterns are generated by the full
// search type, and some by basic, but we need the most common patterns to be searched first, as it returns search
// results much more quickly if we do this (and the user can click on a result before the rarer patterns complete)
// NB duplicates are removed before processing search array
var startArray = [];
var cns = this.getContentNamespace();
var dirEntries = [];
search.scanCount = 0;
// Check if user prefixed search with a namespace-like pattern. If so, do a search for namespace + url
if (/^[-ABCHIJMUVWX]\//.test(search.prefix)) search.searchUrlIndex = true;
// Regex below breaks the string into the pattern: group 1: alphanumericsearch; group 2: regex beginning with .* or .+, or contained in (?:regex)
var isPrefixRegExp = search.prefix.match(/^((?:[^(.]|\((?!\?:)|\.(?![*+]))*)(\(\?:.*\)|\.[*+].*)$/);
search.rgxPrefix = null;
var prefix = search.prefix;
// Launch a full-text search if possible
if (LZ && !search.searchUrlIndex) {
that.findDirEntriesFromFullTextSearch(search, dirEntries).then(function (fullTextDirEntries) {
// If user initiated a new search, cancel this one
// In particular, do not set the search status back to 'complete'
// as that would cause outdated results to unexpectedly pop up
if (search.status === 'cancelled') return callback([], search);
dirEntries = fullTextDirEntries;
search.status = 'complete';
callback(dirEntries, search);
});
}
if (isPrefixRegExp) {
// User has initiated a regular expression search - note the only regexp special character allowed in the alphanumeric part is \s
prefix = isPrefixRegExp[1].replace(/\\s/g, ' ');
var regexCorrect = true;
try {
search.rgxPrefix = new RegExp(isPrefixRegExp[2], 'i');
} catch (err) {
// User has incorrect regular expression syntax
regexCorrect = false;
}
if (!regexCorrect) {
search.status = 'error';
callback([], search);
return;
}
}
var prefixNameSpaces = '';
if (search.searchUrlIndex) {
var rgxSplitPrefix = /^[-ABCHIJMUVWX]\//;
if (that.zimType === 'zimit' && cns === 'C') {
// We have to account for the Zimit prefix in Type 1 ZIMs
rgxSplitPrefix = /^(?:[CMWX]\/)?(?:[AH]\/)?/;
}
var splitPrefix = prefix.match(rgxSplitPrefix);
prefixNameSpaces = splitPrefix ? splitPrefix[0] : '';
var splitSuffix = prefix.split(rgxSplitPrefix);
prefix = splitSuffix ? splitSuffix[1] : prefix;
}
// Ensure a search is done on the string exactly as typed
startArray.push(prefix);
// Normalize any spacing and make string all lowercase
prefix = prefix.replace(/\s+/g, ' ').toLocaleLowerCase();
// Add lowercase string with initial uppercase (this is a very common pattern)
startArray.push(prefix.replace(/^./, function (m) {
return m.toLocaleUpperCase();
}));
// Add pure lowercase string (rarer)
startArray.push(prefix);
// Add a case-insensitive search for the string (pseudo-regex notation)
startArray.push('/' + prefix + '/i');
// Get the full array of combinations to check number of combinations
var fullCombos = util.removeDuplicateStringsInSmallArray(util.allCaseFirstLetters(prefix, 'full'));
// Put cap on exponential number of combinations (five words = 3^5 = 243 combinations)
search.type = fullCombos.length < 300 ? 'full' : 'basic';
// We have to remove duplicate string combinations because util.allCaseFirstLetters() can return some combinations
// where uppercase and lowercase combinations are exactly the same, e.g. where prefix begins with punctuation
// or currency signs, for languages without case, or where user-entered case duplicates calculated case
var prefixVariants = util.removeDuplicateStringsInSmallArray(
startArray.concat(
// Get basic combinations first for speed of returning results
util.allCaseFirstLetters(prefix).concat(
search.type === 'full' ? fullCombos : []
)
)
);
function searchNextVariant () {
// If user has initiated a new search, cancel this one
if (search.status === 'cancelled') return callback([], search);
var remaining = search.size - dirEntries.length;
if (prefixVariants.length === 0 || remaining < 1) {
// We have found all the title-search entries we are going to get, so indicate search type if we're still searching
if (LZ && !search.searchUrlIndex && search.status !== 'complete') search.type = 'fulltext';
else if (LZ && search.searchUrlIndex && remaining > 0) {
search.type = 'fulltext';
that.findDirEntriesFromFullTextSearch(search, dirEntries, remaining).then(function (fullTextDirEntries) {
if (search.status === 'cancelled') return callback([], search);
dirEntries = fullTextDirEntries;
search.status = 'complete';
callback(dirEntries, search);
});
} else search.status = 'complete';
return callback(dirEntries, search);
}
// Dynamically populate list of articles
search.status = 'interim';
if (!noInterim) callback(dirEntries, search);
search.found = dirEntries.length;
var prefix = prefixNameSpaces + prefixVariants[0];
search.lc = false;
// If it's pseudo-regex with a case-insensitive flag like '/my search/i', do an enhanced case-insensitive search
if (/^\/.+\/i$/.test(prefixVariants[0])) {
search.lc = true;
prefix = prefixNameSpaces + prefixVariants[0].replace(/^\/(.+)\/i/, '$1').toLocaleLowerCase();
console.debug('Searching case-insensitively for: "' + prefix + '"');
}
// Remove in-progress search variant from array
prefixVariants = prefixVariants.slice(1);
// Search window sets an upper limit on how many matching dirEntries will be scanned in a full index search
search.window = search.rgxPrefix ? 10000 * search.size : search.size;
that.findDirEntriesWithPrefixCaseSensitive(prefix, search,
function (newDirEntries, countReport, interim) {
search.countReport = countReport;
if (search.status === 'cancelled') return callback([], search);
if (!noInterim && countReport === true) return callback(dirEntries, search);
// Only push interim results to the dirEntries array (otherwise we get a duplicated array when the final results are reported to this function)
if (interim) {
// Collect all the found paths for the dirEntries so far
var dirEntryPaths = [];
for (var i = 0; i < dirEntries.length; i++) {
dirEntryPaths.push(dirEntries[i].url);
}
// Push new directory entries to the end of the global array so long as they are not duplicates
for (var j = 0; j < newDirEntries.length; j++) {
if (~dirEntryPaths.indexOf(newDirEntries[j].url)) continue;
dirEntries.push(newDirEntries[j]);
}
search.found = dirEntries.length;
if (!noInterim && newDirEntries.length) return callback(dirEntries, search);
} else return searchNextVariant();
}
);
}
searchNextVariant();
};
/**
* A method to return the namespace in the ZIM file that contains the primary user content. In old-format ZIM files (minor
* version 0) there are a number of content namespaces, but the primary one in which to search for titles is 'A'. In new-format
* ZIMs (minor version 1) there is a single content namespace 'C'. See https://openzim.org/wiki/ZIM_file_format. This method
* throws an error if it cannot determine the namespace or if the ZIM is not ready.
* @returns {String} The content namespace for the ZIM archive
*/
ZIMArchive.prototype.getContentNamespace = function () {
if (this.isReady()) {
var ver = this.file.minorVersion;
// DEV: There are currently only two defined values for minorVersion in the OpenZIM specification
// If this changes, adapt the error checking and return values
if (ver > 2) {
console.error('Unknown ZIM minor version: ' + ver + '! Assuming content namespace is C.');
}
return ver === 0 ? 'A' : 'C';
} else {
throw new Error('We could not determine the content namespace because the ZIM file is not ready!');
}
};
/**
* Look for dirEntries with title starting with the given prefix (case-sensitive)
*
* @param {String} prefix The case-sensitive value against which dirEntry titles (or url) will be compared
* @param {Object} search The appstate.search object (for comparison, so that we can cancel long binary searches)
* @param {callbackDirEntryList} callback The function to call with the array of dirEntries with titles that begin with prefix
* @param {Integer} startIndex The index number with which to commence the search, or null
*/
ZIMArchive.prototype.findDirEntriesWithPrefixCaseSensitive = function (prefix, search, callback, startIndex) {
// Save the value of startIndex because value of null has a special meaning in combination with prefix:
// produces a list of matches starting with first match and then next x dirEntries thereafter
var saveStartIndex = startIndex;
startIndex = startIndex || 0;
prefix = prefix || '';
var cns = this.getContentNamespace();
// Search v1 article listing if available, otherwise fallback to v0
var articleCount = this.file.articleCount || this.file.entryCount;
var searchFunction = appstate.selectedArchive.file.dirEntryByTitleIndex;
if (search.searchUrlIndex) {
articleCount = this.file.entryCount;
searchFunction = appstate.selectedArchive.file.dirEntryByUrlIndex;
}
util.binarySearch(startIndex, articleCount, function (i) {
return searchFunction(i).then(function (dirEntry) {
if (search.status === 'cancelled') return 0;
var ns = dirEntry.namespace;
var ti = search.searchUrlIndex ? dirEntry.url : dirEntry.getTitleOrUrl();
if (!search.searchUrlIndex) {
// DEV: This search is redundant if we managed to populate articlePtrLst and articleCount, but it only takes two instructions and
// provides maximum compatibility with rare ZIMs where attempts to find first and last article (in zimArchive.js) may have failed
if (ns < cns) return 1;
if (ns > cns) return -1;
// We should now be in namespace A (old format ZIM) or C (new format ZIM)
if (search.lc) { // Search comparator should be lowercase (for case-insensitive search)
ti = ti.toLocaleLowerCase();
prefix = prefix.toLocaleLowerCase();
}
return prefix <= ti ? -1 : 1;
} else {
if (search.lc) { // Search comparator should be lowercase (for case-insensitive search)
ns = ns + '/' + ti.replace(/^((?:[AH])?)\/?.*/, '$1');
ti = ti.replace(/^[AH]\//, '').toLocaleLowerCase();
}
// if (search.rgxPrefix && search.rgxPrefix.test(ti)) return -1;
return prefix <= (ns + '/' + ti) ? -1 : 1;
}
});
}, true).then(function (firstIndex) {
var vDirEntries = [];
var addDirEntries = function (index, lastTitle) {
if (search.status === 'cancelled' || search.found >= search.size || index >= articleCount ||
lastTitle && !~lastTitle.indexOf(prefix) || index - firstIndex >= search.window) {
// DEV: Diagnostics to be removed before merge
if (vDirEntries.length) {
console.debug('Scanned ' + (index - firstIndex) + ' titles for "' + prefix +
'" (found ' + vDirEntries.length + ' match' + (vDirEntries.length === 1 ? ')' : 'es)'));
}
return {
dirEntries: vDirEntries,
nextStart: index
};
}
return searchFunction(index).then(function (dirEntry) {
search.scanCount++;
var title = dirEntry.getTitleOrUrl();
// If we are searching by URL, display namespace also
if (search.searchUrlIndex) title = dirEntry.namespace + '/' + dirEntry.url;
if (search.lc && !search.rgxPrefix) { // Search comparator should be lowercase if not using regex (for case-insensitive search)
var ns = title.replace(/^((?:C\/)?(?:[AH]\/)?).*/, '$1');
title = ns + title.replace(ns, '').toLocaleLowerCase();
}
// Only return dirEntries with titles that actually begin with prefix
if (saveStartIndex === null || (search.searchUrlIndex || dirEntry.namespace === cns) && title.indexOf(prefix) === 0) {
if (!search.rgxPrefix || search.rgxPrefix && search.rgxPrefix.test(title)) { // Regex test case-insensitive if i flag set
vDirEntries.push(dirEntry);
// Report interim result
if (typeof saveStartIndex === 'undefined') callback([dirEntry], false, true);
}
}
// Report number of titles scanned every 5000 titles
if (!(search.scanCount % 5000) && typeof saveStartIndex === 'undefined') callback([], true, true);
return addDirEntries(index + 1, title);
});
};
return addDirEntries(firstIndex);
}).then(function (objWithIndex) {
return callback(objWithIndex.dirEntries, objWithIndex.nextStart);
});
};
/**
* Find Directory Entries corresponding to the requested search using Full Text search provided by libzim
*
* @param {Object} search The appstate.search object
* @param {Array} dirEntries The array of already found Directory Entries
* @param {Integer} number Optional positive number of search results requested (otherwise params.maxSearchResults will be used)
* @returns {Promise<callbackDirEntry>} The augmented array of Directory Entries with titles that correspond to search
*/
ZIMArchive.prototype.findDirEntriesFromFullTextSearch = function (search, dirEntries, number) {
var cns = this.getContentNamespace();
var that = this;
// We give ourselves an overhead in caclulating the results needed, because full-text search will return some results already found
// var resultsNeeded = Math.floor(params.maxSearchResultsSize - dirEntries.length / 2);
var resultsNeeded = number || params.maxSearchResultsSize;
return this.callLibzimWorker({ action: 'search', text: search.prefix, numResults: resultsNeeded }).then(function (results) {
if (results) {
var dirEntryPaths = [];
var fullTextPaths = [];
// Collect all the found paths for the dirEntries
for (var i = 0; i < dirEntries.length; i++) {
dirEntryPaths.push(dirEntries[i].namespace + '/' + dirEntries[i].url);
}
// Collect all the paths for full text search, pruning as we go
var path;
for (var j = 0; j < results.entries.length; j++) {
search.scanCount++;
path = results.entries[j].path;
// Full-text search result paths are missing the namespace in Type 1 ZIMs, so we add it back
path = cns === 'C' ? cns + '/' + path : path;
if (~dirEntryPaths.indexOf(path)) continue;
fullTextPaths.push(path);
}
var promisesForDirEntries = [];
for (var k = 0; k < fullTextPaths.length; k++) {
promisesForDirEntries.push(that.getDirEntryByPath(fullTextPaths[k]));
}
return Promise.all(promisesForDirEntries).then(function (fullTextDirEntries) {
for (var l = 0; l < fullTextDirEntries.length; l++) {
dirEntries.push(fullTextDirEntries[l]);
}
return dirEntries;
});
} else {
return dirEntries;
}
});
};
/**
* Calls the libzim Web Worker with the given parameters, and returns a Promise with its response
*
* @param {Object} parameters
* @returns {Promise}
*/
ZIMArchive.prototype.callLibzimWorker = function (parameters) {
return new Promise(function (resolve, reject) {
console.debug('Calling libzim WebWorker with parameters', parameters);
var tmpMessageChannel = new MessageChannel();
// var t0 = performance.now();
tmpMessageChannel.port1.onmessage = function (event) {
// var t1 = performance.now();
// var readTime = Math.round(t1 - t0);
// console.debug("Response given by the WebWorker in " + readTime + " ms", event.data);
resolve(event.data);
};
tmpMessageChannel.port1.onerror = function (event) {
// var t1 = performance.now();
// var readTime = Math.round(t1 - t0);
// console.error("Error sent by the WebWorker in " + readTime + " ms", event.data);
reject(event.data);
};
LZ.postMessage(parameters, [tmpMessageChannel.port2]);
});
};
/**
* @callback callbackDirEntry
* @param {DirEntry} dirEntry The DirEntry found
*/
/**
*
* @param {DirEntry} dirEntry
* @param {callbackDirEntry} callback
*/
ZIMArchive.prototype.resolveRedirect = function (dirEntry, callback) {
var that = this;
this.file.dirEntryByUrlIndex(dirEntry.redirectTarget).then(function (resolvedDirEntry) {
if (that.zimType === 'zimit' && !appstate.isReplayWorkerAvailable) resolvedDirEntry = transformZimit.filterReplayFiles(resolvedDirEntry);
callback(resolvedDirEntry);
});
};
/**
* @callback callbackStringContent
* @param {String} content String content
*/
/**
*
* @param {DirEntry} dirEntry
* @param {callbackStringContent} callback
*/
ZIMArchive.prototype.readUtf8File = function (dirEntry, callback) {
if (params.isLandingPage && appstate.selectedArchive.zimType === 'zimit' && !appstate.isReplayWorkerAvailable && dirEntry.namespace !== 'M') {
// Mark the directory entry as a redirect
dirEntry.zimitRedirect = this.zimitStartPage;
// Prevent reload loop!
params.isLandingPage = false;
}
var that = this || appstate.selectedArchive;
if (!dirEntry) {
console.warn('No directory entry found for requested URL!');
return callback(dirEntry, '');
}
var cns = that.getContentNamespace();
return dirEntry.readData().then(function (data) {
var mimetype = dirEntry.getMimetype();
var html = that.getUtf8FromData(data);
// Bypass everything if we're using Replay Worker
if (appstate.isReplayWorkerAvailable) {
callback(dirEntry, html);
return;
}
if (/\bx?html\b/i.test(mimetype)) {
// If the data were encoded with a different mimtype, here is how to change it
// var encoding = decData.match(/<meta\b[^>]+?Content-Type[^>]+?charset=([^'"\s]+)/i);
// encoding = encoding ? encoding[1] : '';
// if (encoding && !/utf-8/i.test(encoding)) decData = new TextDecoder(encoding).decode(data);
// Some Zimit assets have moved location and we need to follow the moved permanently data
if (/301\s*moved\s+permanently/i.test(html)) dirEntry = transformZimit.getZimitRedirect(dirEntry, html, cns);
// Some Zimit archives have an incorrect meta charset tag. See https://github.com/openzim/warc2zim/issues/88.
// So we remove it!
html = html.replace(/<meta\b[^>]+?Content-Type[^>]+?charset=([^'"\s]+)[^>]+>\s*/i, function (m0, m1) {
if (!/utf-8/i.test(m1)) {
return '';
}
return m0;
});
}
if (dirEntry.inspect || dirEntry.zimitRedirect) {
if (dirEntry.inspect) dirEntry = transformZimit.getZimitRedirect(dirEntry, html, cns);
if (dirEntry.zimitRedirect) {
return that.getDirEntryByPath(dirEntry.zimitRedirect).then(function (rd) {
return that.readUtf8File(rd, callback);
});
}
} else {
// DEV: Note that we cannot terminate regex below with $ because there is a (rogue?) mimetype
// of 'text/html;raw=true'
if (params.zimType === 'zimit' && /\/(?:x?html|css|javascript)\b/i.test(mimetype) &&
// DEV: We do not want to transform CSS and JS files that the user wishes to inspect the contents of
!(dirEntry.fromArticleList && /\/(?:css|javascript)\b/i.test(mimetype))) {
html = transformZimit.transformReplayUrls(dirEntry, html, mimetype);
}
callback(dirEntry, html);
}
}).catch(function (e) {
console.error('Error reading directory entry', e);
callback(dirEntry, '');
});
};
/**
* @callback callbackBinaryContent
* @param {Uint8Array} content binary content
*/
/**
* Read a binary file.
* @param {DirEntry} dirEntry
* @param {callbackBinaryContent} callback
*/
ZIMArchive.prototype.readBinaryFile = function (dirEntry, callback) {
if (!dirEntry) {
console.warn('Directory entry for requested URL was empty!');
return callback(dirEntry, '');
}
return dirEntry.readData().then(function (data) {
// Bypass everything if we're using Replay Worker
if (appstate.selectedArchive.zimType === 'zimit' && appstate.isReplayWorkerAvailable) {
callback(dirEntry, data);
return;
}
var mimetype = dirEntry.getMimetype();
if (dirEntry.inspect) {
dirEntry = transformZimit.getZimitRedirect(dirEntry, utf8.parse(data), appstate.selectedArchive.getContentNamespace());
if (dirEntry.zimitRedirect) {
return appstate.selectedArchive.getDirEntryByPath(dirEntry.zimitRedirect).then(function (rd) {
return appstate.selectedArchive.readBinaryFile(rd, callback);
})
}
} else {
// DEV: Note that we cannot terminate regex below with $ because there is a (rogue?) mimetype
// of 'text/html;raw=true'
if (params.zimType === 'zimit' && /\/(?:x?html|css|javascript)\b/i.test(mimetype) &&
// DEV: We do not want to transform CSS and JS files that the user wishes to inspect the contents of
!(dirEntry.fromArticleList && /\/(?:css|javascript)\b/i.test(mimetype))) {
data = transformZimit.transformReplayUrls(dirEntry, utf8.parse(data), mimetype);
}
callback(dirEntry, data);
}
});
};
/**
* Gets the UTF-8 string from the binary data
* @param {Blob} data Binary content
* @returns {String} UTF-8 string
*/
ZIMArchive.prototype.getUtf8FromData = function (data) {
var decData;
if (window.TextDecoder) {
decData = new TextDecoder('utf-8').decode(data);
} else {
// Support for IE11 and Edge Legacy - only support UTF-8 decoding
decData = utf8.parse(data);
}
return decData;
}
/**
* Searches the URL pointer list of Directory Entries by pathname
* @param {String} path The pathname of the DirEntry that is required (namespace + filename)
* @param {Boolean} zimitResolving A flag to indicate that the a Zimit path is in a lookup loop
* @param {String} originalPath Optional string used internally to prevent infinite loop
* @param {Boolean} followRedirects A flag to indicate that redirects should be followed
* @return {Promise<DirEntry>} A Promise that resolves to a Directory Entry, or null if not found.
*/
ZIMArchive.prototype.getDirEntryByPath = function (path, zimitResolving, originalPath, followRedirects) {
var that = this || appstate.selectedArchive;
if (that.zimType === 'zimit' && (!appstate.isReplayWorkerAvailable || followRedirects)) {
if (originalPath) appstate.originalPath = originalPath;
path = path.replace(/\?kiwix-display/, '');
// Correct obvious errors
if (!originalPath) {
var revisedPath = path.replace(/.*?((?:C\/A|A)\/(?!.*(?:C\/A|A)).+)$/, '$1');
if (revisedPath !== path) {
console.warn('*** Revised path from ' + path + '\nto: ' + revisedPath + ' ***');
if (appstate.selectedArchive.zimType === 'zimit') {
console.debug('*** DEV: Consider correcting this error in tranformZimit.js ***');
}
path = revisedPath;
}
}
}
return util.binarySearch(0, that.file.entryCount, function (i) {
return that.file.dirEntryByUrlIndex(i).then(function (dirEntry) {
var url = dirEntry.namespace + '/' + dirEntry.url;
if (path < url) {
return -1;
} else if (path > url) {
return 1;
} else {
return 0;
}
});
}).then(function (index) {
if (index === null) return null;
return that.file.dirEntryByUrlIndex(index);
}).then(function (dirEntry) {
// Filter Zimit dirEntries and do somee initial transforms
if (that.zimType === 'zimit' && !appstate.isReplayWorkerAvailable) {
dirEntry = transformZimit.filterReplayFiles(dirEntry);
}
if (!dirEntry && (!appstate.isReplayWorkerAvailable || followRedirects)) {
// We couldn't get the dirEntry, so look it up the Zimit header
if (!zimitResolving && that.zimType === 'zimit' && !/^(H|C\/H)\//.test(path) && path !== appstate.originalPath) {
// We need to look the file up in the Header namespace (double replacement ensures both types of ZIM are supported)
var oldPath = path;
path = path.replace(/^A\//, 'H/').replace(/^(C\/)A\//, '$1H/');
console.debug('DirEntry ' + oldPath + ' not found, looking up header: ' + path);
return that.getDirEntryByPath(path, true, oldPath, followRedirects);
} else if (zimitResolving && !appstate.isReplayWorkerAvailable && appstate.originalPath && appstate.originalPath === appstate.expectedArticleURLToBeDisplayed) {
// We couldn't find the Header, so try a fuzzy search only if the user is loading an article
path = appstate.originalPath;
var ns = path.replace(/^((?:C\/)?A\/).*/, '$1'); // If Zimit pseudo-namespaces are changed, will need to edit this
path = path.replace(ns, '');
path = path.toLocaleLowerCase(); // We are going to combine case-insensitive string comparison with regex matching
var rgxPath = path.replace(/([-/?.$^|*+()[{])/g, '\\$1'); // Make sure we escape regex characters
path = ns + path; // Add namespace back to path for full matching
// path = ns;
var search = {
rgxPrefix: new RegExp('.*' + rgxPath, 'i'),
searchUrlIndex: true,
lc: true, // Make the comparator (e.g. dirEntry.url) lowercase
size: 1,
found: 0
}
return fuzzySearch(path, search);
} else if (!appstate.isReplayWorkerAvailable) {
var newpath = path.replace(/^((?:A|C\/A)\/)[^/]+\/(.+)$/, '$1$2');
if (newpath === path) return null; // No further paths to explore!
console.log('Article ' + path + ' not available, but moving up one directory to compensate for ZIM coding error...');
return that.getDirEntryByPath(newpath);
} else {
// Not found!
return null;
}
} else {
// DEBUG: List found Directory Entry
// if (dirEntry) console.debug('Found ' + path);
return dirEntry;
}
});
};
/**
* Initiate a fuzzy search for dirEntries matching the search object
* @param {String} path Human-readable path to search for
* @param {Object} search The search object
* @returns {Promise<DirEntry>} A Promise that resolves to a Directory Entry, or null if not found
*/
function fuzzySearch (path, search) {
return new Promise(function (resolve, reject) {
console.log('Initiating fuzzy search for ' + path + '...');
uiUtil.pollSpinner('Fuzzy search for ' + path + '...', true);
var searchResolved = false;
// setTimeout(function () {
// if (!searchResolved) uiUtil.pollSpinner('Fuzzy search for ' + path + '...', true);
// }, 5000);
appstate.selectedArchive.findDirEntriesWithPrefixCaseSensitive(path, search, function (dirEntry) {
if (!search.found && dirEntry && dirEntry[0] && dirEntry[0].url) {
search.found++;
dirEntry = dirEntry[0];
dirEntry = transformZimit.filterReplayFiles(dirEntry);
if (dirEntry) console.debug('Found ' + dirEntry.url + ' in fuzzy search');
searchResolved = true;
resolve(dirEntry);
} else {
console.debug('No fuzzy search results found');
searchResolved = true;
resolve(null);
}
}, null);
});
}
/**
*
* @param {callbackDirEntry} callback
*/
ZIMArchive.prototype.getRandomDirEntry = function (callback) {
// Prefer an article-only (v1) title pointer list, if available
var articleCount = this.file.articleCount || this.file.entryCount;
var index = Math.floor(Math.random() * articleCount);
this.file.dirEntryByTitleIndex(index).then(callback);
};
/**
* Read a Metadata string inside the ZIM file.
* @param {String} key
* @param {callbackMetadata} callback
*/
ZIMArchive.prototype.getMetadata = function (key, callback) {
var that = this;
this.getDirEntryByPath('M/' + key).then(function (dirEntry) {
if (dirEntry === null || dirEntry === undefined) {
console.warn('Title M/' + key + ' not found in the archive');
callback();
} else {
that.readUtf8File(dirEntry, function (dirEntryRead, data) {
callback(data);
});
}
}).catch(function (e) {
console.warn('Metadata with key ' + key + ' not found in the archive', e);
callback();
});
};
/**
* Add Metadata to the ZIM file
* @param {String} key The key of the metadata to add to the ZIM file
* @returns {Promise<String>} A Promise that resolves with the metadata string, if it exists
*/
ZIMArchive.prototype.addMetadataToZIMFile = function (key) {
var that = this;
var lcaseKey = key.toLocaleLowerCase();
return new Promise(function (resolve, reject) {
that.getMetadata(key, function (data) {
data = data || '';
that[lcaseKey] = data;
resolve(data);
});
});
};
/**
* Sets the Zimit metadata for the archive
*/
ZIMArchive.prototype.setZimitMetadata = function () {
var that = this;
// Get the landing page
return this.file.dirEntryByUrlIndex(this.file.mainPage).then(function (dirEntry) {
var findRedirectTarget = dirEntry.redirect ? function (dirEntry) {
// If the landing page is a redirect, we need to find the target
return that.file.dirEntryByUrlIndex(dirEntry.redirectTarget).then(function (newEntry) {
return newEntry;
});
} : function (dirEntry) {
return Promise.resolve(dirEntry);
};
return findRedirectTarget(dirEntry).then(function (reEntry) {
// Note that in the case of zimit classic, the values below will be overwritten in the conditional clause
that.zimitPseudoContentNamespace = reEntry.namespace + '/';
that.zimitStartPage = reEntry.namespace + '/' + reEntry.url;
that.zimitPrefix = reEntry.url.replace(/\/.*$/, '') + '/';
if (that.zimType === 'zimit') {
return reEntry.readData().then(function (data) {
var html = that.getUtf8FromData(data);
var redirect = html.match(/window\.mainUrl\s*=\s*(['"])https?:\/\/([^/]+)(.+?)\1/);
if (redirect && redirect[2] && redirect[3]) {
// Logic added to distinguish between Type 0 and Type 1 Zimit ZIMs
var relativeZimitPrefix = (reEntry.namespace === 'C' ? 'A/' : '') + redirect[2];
var zimitStartPage = reEntry.namespace + '/' + relativeZimitPrefix + redirect[3];
// Store a full Zimit prefix in the archive object
that.zimitPrefix = relativeZimitPrefix + '/';
that.zimitStartPage = zimitStartPage;
that.zimitPseudoContentNamespace = reEntry.namespace + '/' + (reEntry.namespace === 'C' ? 'A/' : '');
}
});
}
});
}).catch(function (e) {
console.warn('Zimit metadata not found in this archive!', e);
});