Dario Ghunney Ware 58c1bccfcc renamed module: enterprise > proprietary
updating paths (DOCKER_SECURITY_ENABLE > ADDITIONAL_FEATURES)
2025-06-05 18:13:50 +01:00

146 lines
5.2 KiB
JavaScript

importScripts('./diff.js');
self.onmessage = async function (e) {
const { text1, text2, color1, color2 } = e.data;
console.log('Received text for comparison:', { text1, text2 });
const startTime = performance.now();
if (text1.trim() === "" || text2.trim() === "") {
self.postMessage({ status: 'error', message: 'One or both of the texts are empty.' });
return;
}
const words1 = text1.split(' ');
const words2 = text2.split(' ');
const MAX_WORD_COUNT = 150000;
const COMPLEX_WORD_COUNT = 50000;
const BATCH_SIZE = 5000; // Define a suitable batch size for processing
const OVERLAP_SIZE = 200; // Number of words to overlap - bigger increases accuracy but affects performance
const isComplex = words1.length > COMPLEX_WORD_COUNT || words2.length > COMPLEX_WORD_COUNT;
const isTooLarge = words1.length > MAX_WORD_COUNT || words2.length > MAX_WORD_COUNT;
let complexMessage = 'One or both of the provided documents are large files, accuracy of comparison may be reduced';
let tooLargeMessage = 'One or Both of the provided documents are too large to process';
// Listen for messages from the main thread
self.addEventListener('message', (event) => {
if (event.data.type === 'SET_TOO_LARGE_MESSAGE') {
tooLargeMessage = event.data.message;
}
if (event.data.type === 'SET_COMPLEX_MESSAGE') {
complexMessage = event.data.message;
}
});
if (isTooLarge) {
self.postMessage({
status: 'warning',
message: tooLargeMessage,
});
return;
} else {
if (isComplex) {
self.postMessage({
status: 'warning',
message: complexMessage,
});
}
// Perform diff operation depending on document size
const differences = isComplex
? await staggeredBatchDiff(words1, words2, color1, color2, BATCH_SIZE, OVERLAP_SIZE)
: diff(words1, words2, color1, color2);
console.log(`Diff operation took ${performance.now() - startTime} milliseconds`);
self.postMessage({ status: 'success', differences });
}
};
//Splits text into smaller batches to run through diff checking algorithms. overlaps the batches to help ensure
async function staggeredBatchDiff(words1, words2, color1, color2, batchSize, overlapSize) {
const differences = [];
const totalWords1 = words1.length;
const totalWords2 = words2.length;
let previousEnd1 = 0; // Track where the last batch ended in words1
let previousEnd2 = 0; // Track where the last batch ended in words2
// Function to determine if differences are large, differences that are too large indicate potential error in batching
const isLargeDifference = (differences) => {
return differences.length > 50;
};
while (previousEnd1 < totalWords1 || previousEnd2 < totalWords2) {
// Define the next chunk boundaries
const start1 = previousEnd1;
const end1 = Math.min(start1 + batchSize, totalWords1);
const start2 = previousEnd2;
const end2 = Math.min(start2 + batchSize, totalWords2);
//If difference is too high decrease batch size for more granular check
const dynamicBatchSize = isLargeDifference(differences) ? batchSize / 2 : batchSize;
// Adjust the size of the current chunk using dynamic batch size
const batchWords1 = words1.slice(start1, end1 + dynamicBatchSize);
const batchWords2 = words2.slice(start2, end2 + dynamicBatchSize);
// Include overlap from the previous chunk
const overlapWords1 = previousEnd1 > 0 ? words1.slice(Math.max(0, previousEnd1 - overlapSize), previousEnd1) : [];
const overlapWords2 = previousEnd2 > 0 ? words2.slice(Math.max(0, previousEnd2 - overlapSize), previousEnd2) : [];
// Combine overlaps and current batches for comparison
const combinedWords1 = overlapWords1.concat(batchWords1);
const combinedWords2 = overlapWords2.concat(batchWords2);
// Perform the diff on the combined words
const batchDifferences = diff(combinedWords1, combinedWords2, color1, color2);
differences.push(...batchDifferences);
// Update the previous end indices based on the results of this batch
previousEnd1 = end1;
previousEnd2 = end2;
}
return differences;
}
// Standard diff function for small text comparisons
function diff(words1, words2, color1, color2) {
console.log(`Starting diff between ${words1.length} words and ${words2.length} words`);
const matrix = Array.from({ length: words1.length + 1 }, () => Array(words2.length + 1).fill(0));
for (let i = 1; i <= words1.length; i++) {
for (let j = 1; j <= words2.length; j++) {
matrix[i][j] = words1[i - 1] === words2[j - 1]
? matrix[i - 1][j - 1] + 1
: Math.max(matrix[i][j - 1], matrix[i - 1][j]);
}
}
return backtrack(matrix, words1, words2, color1, color2);
}
// Backtrack function to find differences
function backtrack(matrix, words1, words2, color1, color2) {
let i = words1.length, j = words2.length;
const differences = [];
while (i > 0 || j > 0) {
if (i > 0 && j > 0 && words1[i - 1] === words2[j - 1]) {
differences.unshift(['black', words1[i - 1]]);
i--; j--;
} else if (j > 0 && (i === 0 || matrix[i][j] === matrix[i][j - 1])) {
differences.unshift([color2, words2[j - 1]]);
j--;
} else {
differences.unshift([color1, words1[i - 1]]);
i--;
}
}
return differences;
}