Skip to content

Commit

Permalink
Add HTML5 validation and cleaning to denest folders and set alternate…
Browse files Browse the repository at this point in the history
… entry.
  • Loading branch information
rtibbles committed Jan 7, 2025
1 parent 016f3f4 commit 0ee8c0e
Show file tree
Hide file tree
Showing 9 changed files with 648 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -552,15 +552,10 @@
const parentPropDefinedForInheritModal = Boolean(this.$refs.inheritModal?.parent);
this.newNodeIds = await Promise.all(
fileUploads.map(async (file, index) => {
let title;
if (file.metadata.title) {
title = file.metadata.title;
} else {
title = file.original_filename
.split('.')
.slice(0, -1)
.join('.');
}
const title = file.original_filename
.split('.')
.slice(0, -1)
.join('.');
const newNodeId = await this.createNode(
FormatPresets.has(file.preset) && FormatPresets.get(file.preset).kind_id,
{ title, ...file.metadata }
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { createHash } from 'crypto';
import { createPredictableZip } from '../zipFile';
import JSZip from 'jszip';
import { createPredictableZip, findFirstHtml, findCommonRoot } from '../zipFile';

// Test data and expected MD5s
// All MD5s are generated using ricecooker's test_zip.py
Expand Down Expand Up @@ -87,3 +88,255 @@ describe('createPredictableZip', () => {
expect(getMD5(zipData1)).toBe(getMD5(zipData2));
});
});

describe('findFirstHtml function', () => {
  /**
   * Build an in-memory zip File from a map of archive path -> text content.
   * Each call uses its own JSZip instance, so entries added for one test can
   * never show up in the archive built for another.
   */
  async function createTestZip(files) {
    const zip = new JSZip();
    for (const [path, content] of Object.entries(files)) {
      zip.file(path, content);
    }
    const zipContent = await zip.generateAsync({ type: 'blob' });
    return new File([zipContent], 'test.zip', { type: 'application/zip' });
  }

  it('should prioritize root level index.html when multiple exist', async () => {
    const file = await createTestZip({
      'dist/index.html': '<html></html>',
      'dist/subfolder/index.html': '<html></html>',
      'dist/assets/style.css': 'body {}',
    });

    const entryPoint = await findFirstHtml(file);
    expect(entryPoint).toBe('dist/index.html');
  });

  it('should find nested index.html when no root index exists', async () => {
    const file = await createTestZip({
      'dist/subfolder/index.html': '<html></html>',
      'dist/about.html': '<html></html>',
    });

    const entryPoint = await findFirstHtml(file);
    expect(entryPoint).toBe('dist/subfolder/index.html');
  });

  it('should handle deeply nested structures correctly', async () => {
    const file = await createTestZip({
      'project/dist/index.html': '<html></html>',
      'project/dist/subfolder/index.html': '<html></html>',
    });

    const entryPoint = await findFirstHtml(file);
    expect(entryPoint).toBe('project/dist/index.html');
  });

  it('should fall back to shallowest HTML file when no index.html exists', async () => {
    const file = await createTestZip({
      'dist/main.html': '<html></html>',
      'dist/subfolder/about.html': '<html></html>',
    });

    const entryPoint = await findFirstHtml(file);
    expect(entryPoint).toBe('dist/main.html');
  });

  it('should error for corrupt zip files', async () => {
    const file = new File(['not a zip file'], 'test.zip', { type: 'application/zip' });

    // Hand the promise itself to `expect`: this is the documented form of
    // `.rejects` and needs no wrapper function.
    await expect(findFirstHtml(file)).rejects.toThrow();
  });
});

describe('findCommonRoot', () => {
  // Paths ending in '/' describe folders; everything else is a file.
  const isFolderPath = entryPath => entryPath.endsWith('/');

  /**
   * Populate a JSZip archive from a list of paths and hand back its `files`
   * object, which is the input findCommonRoot is called with.
   */
  async function buildZipEntries(entryPaths) {
    const archive = new JSZip();

    // Register every folder before any file is added
    for (const folderPath of entryPaths.filter(isFolderPath)) {
      // JSZip's folder() is given the name without the trailing slash
      archive.folder(folderPath.slice(0, -1));
    }

    // Files get placeholder content
    for (const filePath of entryPaths.filter(entryPath => !isFolderPath(entryPath))) {
      archive.file(filePath, 'content');
    }

    // Generate the archive once before reading the file table
    await archive.generateAsync({ type: 'blob' });
    return archive.files;
  }

  /** Build the archive described by `entryPaths` and assert its common root. */
  async function expectCommonRoot(entryPaths, expectedRoot) {
    const entries = await buildZipEntries(entryPaths);
    expect(findCommonRoot(entries)).toBe(expectedRoot);
  }

  it('should return empty string for empty files object', async () => {
    await expectCommonRoot([], '');
  });

  it('should return empty string when no common root exists', async () => {
    await expectCommonRoot(['file1.txt', 'file2.txt', 'different/path/file3.txt'], '');
  });

  it('should find single level common root', async () => {
    await expectCommonRoot(
      ['dist/', 'dist/file1.txt', 'dist/file2.txt', 'dist/subfolder/', 'dist/subfolder/file3.txt'],
      'dist'
    );
  });

  it('should ignore directory entries when finding common root', async () => {
    await expectCommonRoot(
      ['dist/', 'dist/css/', 'dist/js/', 'dist/index.html', 'dist/css/style.css'],
      'dist'
    );
  });

  it('should find deep common root ignoring directories', async () => {
    await expectCommonRoot(
      [
        'path/to/my/files/',
        'path/to/my/files/subfolder/',
        'path/to/my/files/file1.txt',
        'path/to/my/files/file2.txt',
        'path/to/my/files/subfolder/file3.txt',
      ],
      'path/to/my/files'
    );
  });

  it('should not treat partial directory names as common', async () => {
    await expectCommonRoot(['mydir/', 'mydir/file1.txt', 'mydir2/', 'mydir2/file2.txt'], '');
  });

  it('should handle single file ignoring its directory', async () => {
    await expectCommonRoot(['path/to/files/', 'path/to/files/single.txt'], 'path/to/files');
  });

  it('should not return common root if any file is at root level', async () => {
    await expectCommonRoot(
      ['rootfile.txt', 'folder/', 'folder/file1.txt', 'folder/file2.txt'],
      ''
    );
  });

  it('should handle complex nested structures with directories', async () => {
    await expectCommonRoot(
      [
        'project/',
        'project/src/',
        'project/src/dist/',
        'project/src/dist/build/',
        'project/src/dist/build/index.html',
        'project/src/dist/build/assets/',
        'project/src/dist/build/assets/style.css',
        'project/src/dist/build/assets/js/',
        'project/src/dist/build/assets/js/main.js',
      ],
      'project/src/dist/build'
    );
  });

  it('should handle paths with dots ignoring directories', async () => {
    await expectCommonRoot(
      [
        'my.folder/',
        'my.folder/sub.dir/',
        'my.folder/file1.txt',
        'my.folder/file2.txt',
        'my.folder/sub.dir/file3.txt',
      ],
      'my.folder'
    );
  });

  it('should handle mixed case paths correctly', async () => {
    // Case-sensitive comparison
    await expectCommonRoot(['Dist/', 'dist/file1.txt', 'DIST/file2.txt'], '');
  });

  it('should handle empty directories correctly', async () => {
    await expectCommonRoot(['dist/', 'dist/empty/', 'dist/file1.txt'], 'dist');
  });

  it('should handle deeply nested single file correctly', async () => {
    await expectCommonRoot(['very/deep/nested/path/to/file.txt'], 'very/deep/nested/path/to');
  });

  it('should handle multiple files at different depths correctly', async () => {
    await expectCommonRoot(
      ['root/deep/path/file1.txt', 'root/deep/file2.txt', 'root/deep/path/to/file3.txt'],
      'root/deep'
    );
  });

  it('should handle paths with unicode characters', async () => {
    await expectCommonRoot(
      ['ユーザー/', 'ユーザー/フォルダ/', 'ユーザー/file1.txt', 'ユーザー/フォルダ/file2.txt'],
      'ユーザー'
    );
  });

  it('should handle paths with spaces and special characters', async () => {
    await expectCommonRoot(
      [
        'My Documents/',
        'My Documents/Some & Files/',
        'My Documents/Some & Files/file1.txt',
        'My Documents/Some & Files/file (2).txt',
      ],
      'My Documents/Some & Files'
    );
  });

  it('should handle paths where one is a prefix of another directory name', async () => {
    await expectCommonRoot(
      ['base/', 'base-extended/', 'base/file1.txt', 'base-extended/file2.txt'],
      ''
    );
  });

  it('should handle files with same names at different levels', async () => {
    await expectCommonRoot(
      ['folder1/index.html', 'folder1/subfolder/index.html', 'folder1/subfolder/subsub/index.html'],
      'folder1'
    );
  });

  it('should handle path segments that are numerically sequential', async () => {
    await expectCommonRoot(['path/1/file.txt', 'path/2/file.txt', 'path/10/file.txt'], 'path');
  });
});
94 changes: 92 additions & 2 deletions contentcuration/contentcuration/frontend/shared/utils/zipFile.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import JSZip from 'jszip';
import pako from 'pako';
import crc32 from 'crc-32';

Expand Down Expand Up @@ -171,7 +172,7 @@ function calculateCrc32(data) {
* // Do something with the zipBuffer
* });
*/
async function createPredictableZip(files) {
export async function createPredictableZip(files) {
const sortedPaths = Object.keys(files).sort();
const entries = [];
let offset = 0;
Expand Down Expand Up @@ -240,4 +241,93 @@ async function createPredictableZip(files) {
]);
}

export { createPredictableZip };
/**
 * Determine the directory shared by every file entry of a zip listing.
 *
 * Entries flagged with `dir` are skipped; only the parent directories of the
 * remaining entries are compared, one path segment at a time, so a partial
 * name match (e.g. `base` vs `base-extended`) never counts as shared.
 *
 * @param {Object} files - JSZip files object; keys are used as '/'-separated paths
 * @returns {string} - The shared directory joined with '/', or an empty string
 *   when there are no file entries or they share no leading directory
 */
export function findCommonRoot(files) {
  const directorySegments = [];
  for (const [entryPath, entry] of Object.entries(files)) {
    if (entry.dir) {
      continue;
    }
    // Drop the last segment (the file name) to keep just the containing directory
    const segments = entryPath.split('/');
    segments.pop();
    directorySegments.push(segments);
  }

  if (directorySegments.length === 0) {
    return '';
  }

  const [reference, ...others] = directorySegments;

  // Grow the shared prefix while every other directory has the same segment at
  // this position. A directory that is too short yields `undefined` here, which
  // never equals a segment string, so it ends the scan as a mismatch.
  let sharedDepth = 0;
  while (
    sharedDepth < reference.length &&
    others.every(segments => segments[sharedDepth] === reference[sharedDepth])
  ) {
    sharedDepth += 1;
  }

  return reference.slice(0, sharedDepth).join('/');
}

/**
 * Choose the HTML entry point of a zip archive, preferring index.html.
 *
 * Paths are ranked with the directory common to all files stripped off:
 *   1. an index.html sitting directly in that common directory;
 *   2. otherwise the first index.html found in any subdirectory;
 *   3. otherwise the HTML file with the fewest path segments, the shorter
 *      relative path winning when depths are equal.
 *
 * @param {File} file - The zip file to analyze
 * @returns {Promise<string|null>} - Path of the chosen HTML file exactly as it
 *   is keyed in the archive, or null if the archive has no `.html` entries
 */
export async function findFirstHtml(file) {
  const zip = new JSZip();
  const { files } = await zip.loadAsync(file);

  // The extension check ignores case; the index.html checks below do not.
  const htmlPaths = Object.keys(files).filter(entryPath =>
    entryPath.toLowerCase().endsWith('.html')
  );
  if (htmlPaths.length === 0) {
    return null;
  }

  // Number of leading characters to strip: the common root plus its trailing '/'
  const commonRoot = findCommonRoot(files);
  const prefixLength = commonRoot ? commonRoot.length + 1 : 0;

  const candidates = htmlPaths.map(original => ({
    original,
    normalized: original.slice(prefixLength),
  }));

  // First and second priority: index.html at the root, then anywhere else
  const preferred =
    candidates.find(candidate => candidate.normalized === 'index.html') ||
    candidates.find(candidate => candidate.normalized.split('/').pop() === 'index.html');
  if (preferred) {
    return preferred.original;
  }

  // Last resort: the shallowest HTML file. Only a strictly better candidate
  // replaces the current pick, so the earliest entry wins a full tie.
  const segmentCount = candidate => candidate.normalized.split('/').length;
  const shallowest = candidates.reduce((best, candidate) => {
    const depthGap = segmentCount(candidate) - segmentCount(best);
    const ranking =
      depthGap !== 0 ? depthGap : candidate.normalized.length - best.normalized.length;
    return ranking < 0 ? candidate : best;
  });
  return shallowest.original;
}
Loading

0 comments on commit 0ee8c0e

Please sign in to comment.