Skip to content

Commit

Permalink
Use Magika to more robustly identify text files to send for indexing
Browse files Browse the repository at this point in the history
- `file-type` doesn't handle mislabelled files or files without
   extensions well

- Only show supported file types in the file selector dialog on the Desktop app.
  Use Magika to get the list of text file extensions. Combine it with the other
  supported extensions to get the complete list of supported file extensions.
  Use that list to limit selectable files in the File Open dialog.

  Note: Folder selector will index text files with no extensions as well
  • Loading branch information
debanjum committed Apr 10, 2024
1 parent f040418 commit f2dc970
Show file tree
Hide file tree
Showing 3 changed files with 646 additions and 59 deletions.
32 changes: 25 additions & 7 deletions src/interface/desktop/main.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
const { app, BrowserWindow, ipcMain, Tray, Menu, nativeImage, shell } = require('electron');
const FileType = require('file-type');
const Magika = require('magika').MagikaNode;
const todesktop = require("@todesktop/runtime");
const khojPackage = require('./package.json');

Expand All @@ -15,8 +15,8 @@ const KHOJ_URL = 'https://app.khoj.dev';

const Store = require('electron-store');

const validFileTypes = ['org', 'md', 'markdown', 'txt', 'html', 'xml', 'pdf']

const magika = new Magika();
let validFileTypes;
const binaryFileTypes = ['pdf', 'png', 'jpg', 'jpeg']

const schema = {
Expand Down Expand Up @@ -68,6 +68,7 @@ const schema = {
}
};

let isMagikaLoaded = false;
let syncing = false;
let state = {}
const store = new Store({ schema });
Expand Down Expand Up @@ -113,11 +114,19 @@ function filenameToMimeType (filename) {
}

async function isPlainTextFile(filePath) {
const fileType = await FileType.fromFile(filePath);
if (!fileType) {
if (!isMagikaLoaded) {
await magika.load();
isMagikaLoaded = true;
}
try {
const fileContent = fs.readFileSync(filePath);
const fileType = await magika.identifyBytes(fileContent);
const fileLabel = magika.config.labels.filter(l => l.name == fileType.label)?.[0]
return fileLabel?.is_text
} catch (err) {
console.error("Failed to identify file type: ", err);
return false;
}
return fileType.mime.startsWith('text/');
}

async function processDirectory(filesToPush, folder) {
Expand Down Expand Up @@ -249,9 +258,18 @@ async function pushDataToKhoj (regenerate = false) {
pushDataToKhoj();

async function handleFileOpen (type) {
if (!isMagikaLoaded) {
await magika.load();
isMagikaLoaded = true;
validFileTypes = [
"org", "md", "pdf",
// all text file extensions known to Magika
...magika.config.labels.filter(l => l.is_text == true).map(l => l.name)];
}

let { canceled, filePaths } = {canceled: true, filePaths: []};
if (type === 'file') {
({ canceled, filePaths } = await dialog.showOpenDialog({properties: ['openFile' ], filters: [{ name: "Valid Khoj Files" }] }));
({ canceled, filePaths } = await dialog.showOpenDialog({properties: ['openFile' ], filters: [{ name: "Valid Khoj Files", extensions: validFileTypes }] }));
} else if (type === 'folder') {
({ canceled, filePaths } = await dialog.showOpenDialog({properties: ['openDirectory' ]}));
}
Expand Down
2 changes: 1 addition & 1 deletion src/interface/desktop/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,6 @@
"axios": "^1.6.4",
"cron": "^2.4.3",
"electron-store": "^8.1.0",
"file-type": "^16.2.0"
"magika": "^0.2.13"
}
}
Loading

0 comments on commit f2dc970

Please sign in to comment.