diff --git a/app/core/src/main/java/stirling/software/SPDF/pdf/TextFinder.java b/app/core/src/main/java/stirling/software/SPDF/pdf/TextFinder.java index 432fad101..c99a2ade7 100644 --- a/app/core/src/main/java/stirling/software/SPDF/pdf/TextFinder.java +++ b/app/core/src/main/java/stirling/software/SPDF/pdf/TextFinder.java @@ -27,7 +27,6 @@ public class TextFinder extends PDFTextStripper { public TextFinder(String searchTerm, boolean useRegex, boolean wholeWordSearch) throws IOException { - super(); this.searchTerm = searchTerm; this.useRegex = useRegex; this.wholeWordSearch = wholeWordSearch; @@ -68,11 +67,15 @@ public class TextFinder extends PDFTextStripper { } String processedSearchTerm = this.searchTerm.trim(); + if (processedSearchTerm.isEmpty()) { + super.endPage(page); + return; + } String regex = this.useRegex ? processedSearchTerm : "\\Q" + processedSearchTerm + "\\E"; if (this.wholeWordSearch) { if (processedSearchTerm.length() == 1 && Character.isDigit(processedSearchTerm.charAt(0))) { - regex = "(? redactionAreas, boolean convertToImage) throws Exception { + private void testManualRedaction(List redactionAreas, boolean convertToImage) + throws Exception { ManualRedactPdfRequest request = createManualRedactPdfRequest(); request.setRedactions(redactionAreas); request.setConvertPDFToImage(convertToImage); @@ -123,18 +128,16 @@ class RedactControllerTest { @BeforeEach void setUp() throws IOException { - mockPdfFile = new MockMultipartFile( - "fileInput", - "test.pdf", - "application/pdf", - createSimplePdfContent() - ); + mockPdfFile = + new MockMultipartFile( + "fileInput", "test.pdf", "application/pdf", createSimplePdfContent()); // Mock PDF document and related objects mockDocument = mock(PDDocument.class); mockPages = mock(PDPageTree.class); mockPage = mock(PDPage.class); - org.apache.pdfbox.pdmodel.PDDocumentCatalog mockCatalog = mock(org.apache.pdfbox.pdmodel.PDDocumentCatalog.class); + org.apache.pdfbox.pdmodel.PDDocumentCatalog mockCatalog = + mock(org.apache.pdfbox.pdmodel.PDDocumentCatalog.class); // Setup document structure properly when(pdfDocumentFactory.load(any(MockMultipartFile.class))).thenReturn(mockDocument); @@ -153,12 +156,14 @@ class RedactControllerTest { when(mockPage.getMediaBox()).thenReturn(pageRect); when(mockPage.getBBox()).thenReturn(pageRect); - InputStream mockInputStream = new ByteArrayInputStream("BT /F1 12 Tf 100 200 Td (test content) Tj ET".getBytes()); + InputStream mockInputStream = + new ByteArrayInputStream("BT /F1 12 Tf 100 200 Td (test content) Tj ET".getBytes()); when(mockPage.getContents()).thenReturn(mockInputStream); when(mockPage.hasContents()).thenReturn(true); - org.apache.pdfbox.cos.COSDocument mockCOSDocument = mock(org.apache.pdfbox.cos.COSDocument.class); + org.apache.pdfbox.cos.COSDocument mockCOSDocument = + mock(org.apache.pdfbox.cos.COSDocument.class); org.apache.pdfbox.cos.COSStream mockCOSStream = mock(org.apache.pdfbox.cos.COSStream.class); when(mockDocument.getDocument()).thenReturn(mockCOSDocument); when(mockCOSDocument.createCOSStream()).thenReturn(mockCOSStream); @@ -167,11 +172,14 @@ class RedactControllerTest { when(mockCOSStream.createOutputStream()).thenReturn(mockOutputStream); when(mockCOSStream.createOutputStream(any())).thenReturn(mockOutputStream); - doAnswer(invocation -> { - ByteArrayOutputStream baos = invocation.getArgument(0); - baos.write("Mock PDF Content".getBytes()); - return null; - }).when(mockDocument).save(any(ByteArrayOutputStream.class)); + doAnswer( + invocation -> { + ByteArrayOutputStream baos = invocation.getArgument(0); + baos.write("Mock PDF Content".getBytes()); + return null; + }) + .when(mockDocument) + .save(any(ByteArrayOutputStream.class)); doNothing().when(mockDocument).close(); // Initialize a real document for unit tests @@ -185,7 +193,8 @@ class RedactControllerTest { // Set up basic page resources PDResources resources = new PDResources(); - resources.put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA)); + resources.put( + COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA)); realPage.setResources(resources); } @@ -222,7 +231,14 @@ class RedactControllerTest { @Test @DisplayName("Should redact multiple search terms") void redactMultipleSearchTerms() throws Exception { - testAutoRedaction("confidential\nsecret\nprivate\nclassified", false, true, "#FF0000", 2.0f, false, true); + testAutoRedaction( + "confidential\nsecret\nprivate\nclassified", + false, + true, + "#FF0000", + 2.0f, + false, + true); } @Test @@ -250,8 +266,12 @@ class RedactControllerTest { when(page.getBBox()).thenReturn(pageRect); when(page.hasContents()).thenReturn(true); - InputStream mockInputStream = new ByteArrayInputStream( - ("BT /F1 12 Tf 100 200 Td (page " + i + " content with confidential info) Tj ET").getBytes()); + InputStream mockInputStream = + new ByteArrayInputStream( + ("BT /F1 12 Tf 100 200 Td (page " + + i + + " content with confidential info) Tj ET") + .getBytes()); when(page.getContents()).thenReturn(mockInputStream); pageList.add(page); @@ -285,7 +305,8 @@ class RedactControllerTest { when(mockPages.get(0)).thenReturn(mockPage); - org.apache.pdfbox.pdmodel.PDDocumentInformation mockInfo = mock(org.apache.pdfbox.pdmodel.PDDocumentInformation.class); + org.apache.pdfbox.pdmodel.PDDocumentInformation mockInfo = + mock(org.apache.pdfbox.pdmodel.PDDocumentInformation.class); when(mockDocument.getDocumentInformation()).thenReturn(mockInfo); ResponseEntity response = redactController.redactPdf(request); @@ -311,23 +332,27 @@ class RedactControllerTest { @Test @DisplayName("Should handle email pattern redaction") void handleEmailPatternRedaction() throws Exception { - testAutoRedaction("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", true, false, "#0000FF", 1.5f, false, true); + testAutoRedaction( + "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", + true, false, "#0000FF", 1.5f, false, true); } @Test @DisplayName("Should handle phone number patterns") void handlePhoneNumberPatterns() throws Exception { - testAutoRedaction("\\(\\d{3}\\)\\s*\\d{3}-\\d{4}", true, false, "#FF0000", 1.0f, false, true); + testAutoRedaction( + "\\(\\d{3}\\)\\s*\\d{3}-\\d{4}", true, false, "#FF0000", 1.0f, false, true); } @ParameterizedTest - @ValueSource(strings = { - "\\d{3}-\\d{2}-\\d{4}", // SSN pattern - "\\d{4}[\\s-]?\\d{4}[\\s-]?\\d{4}[\\s-]?\\d{4}", // Credit card pattern - "\\b[A-Z]{2,}\\b", // Uppercase words - "\\$\\d+\\.\\d{2}", // Currency pattern - "\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\b" // IP address pattern - }) + @ValueSource( + strings = { + "\\d{3}-\\d{2}-\\d{4}", // SSN pattern + "\\d{4}[\\s-]?\\d{4}[\\s-]?\\d{4}[\\s-]?\\d{4}", // Credit card pattern + "\\b[A-Z]{2,}\\b", // Uppercase words + "\\$\\d+\\.\\d{2}", // Currency pattern + "\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\b" // IP address pattern + }) @DisplayName("Should handle various regex patterns") void handleVariousRegexPatterns(String regexPattern) throws Exception { testAutoRedaction(regexPattern, true, false, "#000000", 1.0f, false, true); @@ -519,8 +544,10 @@ class RedactControllerTest { when(page.getBBox()).thenReturn(pageRect); when(page.hasContents()).thenReturn(true); - InputStream mockInputStream = new ByteArrayInputStream( - ("BT /F1 12 Tf 100 200 Td (page " + i + " content) Tj ET").getBytes()); + InputStream mockInputStream = + new ByteArrayInputStream( + ("BT /F1 12 Tf 100 200 Td (page " + i + " content) Tj ET") + .getBytes()); when(page.getContents()).thenReturn(mockInputStream); pageList.add(page); @@ -588,36 +615,38 @@ class RedactControllerTest { request.setFileInput(null); request.setListOfText("test"); - assertDoesNotThrow(() -> { - try { - redactController.redactPdf(request); - } catch (Exception e) { - assertNotNull(e); - } - }); + assertDoesNotThrow( + () -> { + try { + redactController.redactPdf(request); + } catch (Exception e) { + assertNotNull(e); + } + }); } @Test @DisplayName("Should handle malformed PDF gracefully") void handleMalformedPdfGracefully() throws Exception { - MockMultipartFile malformedFile = new MockMultipartFile( - "fileInput", - "malformed.pdf", - "application/pdf", - "Not a real PDF content".getBytes() - ); + MockMultipartFile malformedFile = + new MockMultipartFile( + "fileInput", + "malformed.pdf", + "application/pdf", + "Not a real PDF content".getBytes()); RedactPdfRequest request = new RedactPdfRequest(); request.setFileInput(malformedFile); request.setListOfText("test"); - assertDoesNotThrow(() -> { - try { - redactController.redactPdf(request); - } catch (Exception e) { - assertNotNull(e); - } - }); + assertDoesNotThrow( + () -> { + try { + redactController.redactPdf(request); + } catch (Exception e) { + assertNotNull(e); + } + }); } @Test @@ -723,14 +752,24 @@ class RedactControllerTest { } @ParameterizedTest - @ValueSource(strings = {"#FF0000", "#00FF00", "#0000FF", "#FFFFFF", "#000000", "FF0000", "00FF00", "0000FF"}) + @ValueSource( + strings = { + "#FF0000", "#00FF00", "#0000FF", "#FFFFFF", "#000000", "FF0000", "00FF00", + "0000FF" + }) @DisplayName("Should handle various valid color formats") void handleVariousValidColorFormats(String colorInput) throws Exception { Color result = redactController.decodeOrDefault(colorInput); assertNotNull(result); - assertTrue(result.getRed() >= 0 && result.getRed() <= 255, "Red component should be in valid range"); - assertTrue(result.getGreen() >= 0 && result.getGreen() <= 255, "Green component should be in valid range"); - assertTrue(result.getBlue() >= 0 && result.getBlue() <= 255, "Blue component should be in valid range"); + assertTrue( + result.getRed() >= 0 && result.getRed() <= 255, + "Red component should be in valid range"); + assertTrue( + result.getGreen() >= 0 && result.getGreen() <= 255, + "Green component should be in valid range"); + assertTrue( + result.getBlue() >= 0 && result.getBlue() <= 255, + "Blue component should be in valid range"); } @Test @@ -755,16 +794,18 @@ class RedactControllerTest { Set targetWords = Set.of("confidential"); - List tokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, false); + List tokens = + redactController.createTokensWithoutTargetText( + realDocument, realPage, targetWords, false, false); assertNotNull(tokens); assertFalse(tokens.isEmpty()); String reconstructedText = extractTextFromTokens(tokens); - assertFalse(reconstructedText.contains("confidential"), - "Target text should be replaced with placeholder"); - assertTrue(reconstructedText.contains("document"), - "Non-target text should remain"); + assertFalse( + reconstructedText.contains("confidential"), + "Target text should be replaced with placeholder"); + assertTrue(reconstructedText.contains("document"), "Non-target text should remain"); } @Test @@ -774,7 +815,9 @@ class RedactControllerTest { Set targetWords = Set.of("secret"); - List tokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, false); + List tokens = + redactController.createTokensWithoutTargetText( + realDocument, realPage, targetWords, false, false); assertNotNull(tokens); @@ -785,7 +828,9 @@ class RedactControllerTest { if (array.getObject(i) instanceof COSString cosString) { String text = cosString.getString(); if (text.contains("secret")) { - fail("Target text 'secret' should have been redacted from TJ array"); + fail( + "Target text 'secret' should have been redacted from TJ" + + " array"); } foundModifiedTJArray = true; } @@ -803,21 +848,33 @@ class RedactControllerTest { Set targetWords = Set.of("redact"); List originalTokens = getOriginalTokens(); - List filteredTokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, false); + List filteredTokens = + redactController.createTokensWithoutTargetText( + realDocument, realPage, targetWords, false, false); - long originalNonTextCount = originalTokens.stream() - .filter(token -> token instanceof Operator op && !redactController.isTextShowingOperator(op.getName())) - .count(); + long originalNonTextCount = + originalTokens.stream() + .filter( + token -> + token instanceof Operator op + && !redactController.isTextShowingOperator( + op.getName())) + .count(); - long filteredNonTextCount = filteredTokens.stream() - .filter(token -> token instanceof Operator op && !redactController.isTextShowingOperator(op.getName())) - .count(); + long filteredNonTextCount = + filteredTokens.stream() + .filter( + token -> + token instanceof Operator op + && !redactController.isTextShowingOperator( + op.getName())) + .count(); - assertTrue(filteredNonTextCount > 0, - "Non-text operators should be preserved"); + assertTrue(filteredNonTextCount > 0, "Non-text operators should be preserved"); - assertTrue(filteredNonTextCount >= originalNonTextCount / 2, - "A reasonable number of non-text operators should be preserved"); + assertTrue( + filteredNonTextCount >= originalNonTextCount / 2, + "A reasonable number of non-text operators should be preserved"); } @Test @@ -827,7 +884,9 @@ class RedactControllerTest { Set targetWords = Set.of("\\d{3}-\\d{2}-\\d{4}"); // SSN pattern - List tokens = redactController.createTokensWithoutTargetText(realPage, targetWords, true, false); + List tokens = + redactController.createTokensWithoutTargetText( + realDocument, realPage, targetWords, true, false); String reconstructedText = extractTextFromTokens(tokens); assertFalse(reconstructedText.contains("111-22-3333"), "SSN should be redacted"); @@ -841,7 +900,9 @@ class RedactControllerTest { Set targetWords = Set.of("test"); - List tokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, true); + List tokens = + redactController.createTokensWithoutTargetText( + realDocument, realPage, targetWords, false, true); String reconstructedText = extractTextFromTokens(tokens); assertTrue(reconstructedText.contains("testing"), "Partial matches should remain"); @@ -856,11 +917,14 @@ class RedactControllerTest { Set targetWords = Set.of("sensitive"); - List tokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, false); + List tokens = + redactController.createTokensWithoutTargetText( + realDocument, realPage, targetWords, false, false); String reconstructedText = extractTextFromTokens(tokens); - assertFalse(reconstructedText.contains("sensitive"), - "Text should be redacted regardless of operator type"); + assertFalse( + reconstructedText.contains("sensitive"), + "Text should be redacted regardless of operator type"); } @Test @@ -884,7 +948,10 @@ class RedactControllerTest { void shouldHandleEmptyTokenList() throws Exception { List emptyTokens = Collections.emptyList(); - assertDoesNotThrow(() -> redactController.writeFilteredContentStream(realDocument, realPage, emptyTokens)); + assertDoesNotThrow( + () -> + redactController.writeFilteredContentStream( + realDocument, realPage, emptyTokens)); assertNotNull(realPage.getContents(), "Page should still have content stream"); } @@ -906,20 +973,27 @@ class RedactControllerTest { @DisplayName("Placeholder creation should maintain text width") void shouldCreateWidthMatchingPlaceholder() throws Exception { String originalText = "confidential"; - String placeholder = redactController.createPlaceholder(originalText); + String placeholder = + redactController.createPlaceholderWithFont( + originalText, new PDType1Font(Standard14Fonts.FontName.HELVETICA)); - assertEquals(originalText.length(), placeholder.length(), - "Placeholder should maintain character count for width preservation"); + assertEquals( + originalText.length(), + placeholder.length(), + "Placeholder should maintain character count for width preservation"); } @Test @DisplayName("Placeholder should handle special characters") void shouldHandleSpecialCharactersInPlaceholder() throws Exception { String originalText = "café naïve"; - String placeholder = redactController.createPlaceholder(originalText); + String placeholder = + redactController.createPlaceholderWithFont( + originalText, new PDType1Font(Standard14Fonts.FontName.HELVETICA)); assertEquals(originalText.length(), placeholder.length()); - assertFalse(placeholder.contains("café"), "Placeholder should not contain original text"); + assertFalse( + placeholder.contains("café"), "Placeholder should not contain original text"); } @Test @@ -929,7 +1003,9 @@ class RedactControllerTest { Set targetWords = Set.of("secret"); - List filteredTokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, false); + List filteredTokens = + redactController.createTokensWithoutTargetText( + realDocument, realPage, targetWords, false, false); redactController.writeFilteredContentStream(realDocument, realPage, filteredTokens); assertNotNull(realPage.getContents()); @@ -946,15 +1022,21 @@ class RedactControllerTest { Set targetWords = Set.of("confidential"); - List filteredTokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, false); + List filteredTokens = + redactController.createTokensWithoutTargetText( + realDocument, realPage, targetWords, false, false); - long filteredPositioning = filteredTokens.stream() - .filter(token -> token instanceof Operator op && - (op.getName().equals("Td") || op.getName().equals("TD") || op.getName().equals("Tm"))) - .count(); + long filteredPositioning = + filteredTokens.stream() + .filter( + token -> + token instanceof Operator op + && ("Td".equals(op.getName()) + || "TD".equals(op.getName()) + || "Tm".equals(op.getName()))) + .count(); - assertTrue(filteredPositioning > 0, - "Positioning operators should be preserved"); + assertTrue(filteredPositioning > 0, "Positioning operators should be preserved"); } @Test @@ -966,16 +1048,21 @@ class RedactControllerTest { } realDocument.addPage(realPage); realPage.setResources(new PDResources()); - realPage.getResources().put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA)); + realPage.getResources() + .put( + COSName.getPDFName("F1"), + new PDType1Font(Standard14Fonts.FontName.HELVETICA)); - try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, realPage)) { + try (PDPageContentStream contentStream = + new PDPageContentStream(realDocument, realPage)) { contentStream.setLineWidth(2); contentStream.moveTo(100, 100); contentStream.lineTo(200, 200); contentStream.stroke(); contentStream.beginText(); - contentStream.setFont(realPage.getResources().getFont(COSName.getPDFName("F1")), 12); + contentStream.setFont( + realPage.getResources().getFont(COSName.getPDFName("F1")), 12); contentStream.newLineAtOffset(50, 750); contentStream.showText("This is a complex document with "); contentStream.setTextRise(5); @@ -990,19 +1077,27 @@ class RedactControllerTest { Set targetWords = Set.of("confidential"); - List tokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, false); + List tokens = + redactController.createTokensWithoutTargetText( + realDocument, realPage, targetWords, false, false); assertNotNull(tokens); assertFalse(tokens.isEmpty()); String reconstructedText = extractTextFromTokens(tokens); - assertFalse(reconstructedText.contains("confidential"), "Target text should be redacted"); + assertFalse( + reconstructedText.contains("confidential"), "Target text should be redacted"); - boolean hasGraphicsOperators = tokens.stream() - .anyMatch(token -> token instanceof Operator op && - (op.getName().equals("re") || op.getName().equals("f") || - op.getName().equals("m") || op.getName().equals("l") || - op.getName().equals("S"))); + boolean hasGraphicsOperators = + tokens.stream() + .anyMatch( + token -> + token instanceof Operator op + && ("re".equals(op.getName()) + || "f".equals(op.getName()) + || "m".equals(op.getName()) + || "l".equals(op.getName()) + || "S".equals(op.getName()))); assertTrue(hasGraphicsOperators, "Graphics operators should be preserved"); } @@ -1019,10 +1114,12 @@ class RedactControllerTest { // Create resources PDResources resources = new PDResources(); - resources.put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA)); + resources.put( + COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA)); realPage.setResources(resources); - try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, realPage)) { + try (PDPageContentStream contentStream = + new PDPageContentStream(realDocument, realPage)) { contentStream.beginText(); contentStream.setFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), 12); contentStream.newLineAtOffset(50, 750); @@ -1180,7 +1277,8 @@ class RedactControllerTest { } realDocument.addPage(realPage); realPage.setResources(new PDResources()); - realPage.getResources().put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA)); + realPage.getResources() + .put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA)); try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, realPage)) { contentStream.beginText(); @@ -1198,7 +1296,8 @@ class RedactControllerTest { } realDocument.addPage(realPage); realPage.setResources(new PDResources()); - realPage.getResources().put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA)); + realPage.getResources() + .put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA)); try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, realPage)) { contentStream.beginText(); @@ -1221,7 +1320,8 @@ class RedactControllerTest { } realDocument.addPage(realPage); realPage.setResources(new PDResources()); - realPage.getResources().put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA)); + realPage.getResources() + .put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA)); try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, realPage)) { contentStream.setLineWidth(2); @@ -1248,7 +1348,8 @@ class RedactControllerTest { } realDocument.addPage(realPage); realPage.setResources(new PDResources()); - realPage.getResources().put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA)); + realPage.getResources() + .put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA)); try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, realPage)) { contentStream.beginText(); @@ -1266,28 +1367,29 @@ class RedactControllerTest { // Helper for token creation private List createSampleTokenList() { return List.of( - Operator.getOperator("BT"), - COSName.getPDFName("F1"), - new COSFloat(12), - Operator.getOperator("Tf"), - new COSString("Sample text"), - Operator.getOperator("Tj"), - Operator.getOperator("ET") - ); + Operator.getOperator("BT"), + COSName.getPDFName("F1"), + new COSFloat(12), + Operator.getOperator("Tf"), + new COSString("Sample text"), + Operator.getOperator("Tj"), + Operator.getOperator("ET")); } private List getOriginalTokens() throws Exception { // Create a new page to avoid side effects from other tests PDPage pageForTokenExtraction = new PDPage(PDRectangle.A4); pageForTokenExtraction.setResources(realPage.getResources()); - try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, pageForTokenExtraction)) { - contentStream.beginText(); - contentStream.setFont(realPage.getResources().getFont(COSName.getPDFName("F1")), 12); - contentStream.newLineAtOffset(50, 750); - contentStream.showText("Original content"); - contentStream.endText(); + try (PDPageContentStream contentStream = + new PDPageContentStream(realDocument, pageForTokenExtraction)) { + contentStream.beginText(); + contentStream.setFont(realPage.getResources().getFont(COSName.getPDFName("F1")), 12); + contentStream.newLineAtOffset(50, 750); + contentStream.showText("Original content"); + contentStream.endText(); } - return redactController.createTokensWithoutTargetText(pageForTokenExtraction, Collections.emptySet(), false, false); + return redactController.createTokensWithoutTargetText( + realDocument, pageForTokenExtraction, Collections.emptySet(), false, false); } private String extractTextFromTokens(List tokens) { diff --git a/stirling-pdf/src/test/java/stirling/software/SPDF/pdf/TextFinderTest.java b/app/core/src/test/java/stirling/software/SPDF/pdf/TextFinderTest.java similarity index 69% rename from stirling-pdf/src/test/java/stirling/software/SPDF/pdf/TextFinderTest.java rename to app/core/src/test/java/stirling/software/SPDF/pdf/TextFinderTest.java index ebb5bebf7..3e5092070 100644 --- a/stirling-pdf/src/test/java/stirling/software/SPDF/pdf/TextFinderTest.java +++ b/app/core/src/test/java/stirling/software/SPDF/pdf/TextFinderTest.java @@ -1,5 +1,11 @@ package stirling.software.SPDF.pdf; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + import java.io.IOException; import java.util.List; @@ -10,11 +16,6 @@ import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.font.PDType1Font; import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import org.junit.jupiter.api.AfterEach; -import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.junit.jupiter.api.Assertions.assertTrue; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.DisplayName; import org.junit.jupiter.api.Nested; @@ -34,33 +35,44 @@ class TextFinderTest { private PDPage page; // Helpers - private void testTextFinding(String pageContent, String searchTerm, boolean useRegex, boolean wholeWord, - String[] expectedTexts, int expectedCount) throws IOException { + private void testTextFinding( + String pageContent, + String searchTerm, + boolean useRegex, + boolean wholeWord, + String[] expectedTexts, + int expectedCount) + throws IOException { addTextToPage(pageContent); TextFinder textFinder = new TextFinder(searchTerm, useRegex, wholeWord); textFinder.getText(document); List foundTexts = textFinder.getFoundTexts(); - assertEquals(expectedCount, foundTexts.size(), - String.format("Expected %d matches for search term '%s'", expectedCount, searchTerm)); + assertEquals( + expectedCount, + foundTexts.size(), + String.format( + "Expected %d matches for search term '%s'", expectedCount, searchTerm)); if (expectedTexts != null) { for (String expectedText : expectedTexts) { - assertTrue(foundTexts.stream().anyMatch(text -> text.getText().equals(expectedText)), - String.format("Expected to find text: '%s'", expectedText)); + assertTrue( + foundTexts.stream().anyMatch(text -> text.getText().equals(expectedText)), + String.format("Expected to find text: '%s'", expectedText)); } } // Verify basic properties of found texts - foundTexts.forEach(text -> { - assertNotNull(text.getText()); - assertTrue(text.getX1() >= 0); - assertTrue(text.getY1() >= 0); - assertTrue(text.getX2() >= text.getX1()); - assertTrue(text.getY2() >= text.getY1()); - assertEquals(0, text.getPageIndex()); // Single page test - }); + foundTexts.forEach( + text -> { + assertNotNull(text.getText()); + assertTrue(text.getX1() >= 0); + assertTrue(text.getY1() >= 0); + assertTrue(text.getX2() >= text.getX1()); + assertTrue(text.getY2() >= text.getY1()); + assertEquals(0, text.getPageIndex()); // Single page test + }); } @BeforeEach @@ -84,25 +96,37 @@ class TextFinderTest { @Test @DisplayName("Should find simple text correctly") void findSimpleText() throws IOException { - testTextFinding("This is a confidential document with secret information.", - "confidential", false, false, - new String[]{"confidential"}, 1); + testTextFinding( + "This is a confidential document with secret information.", + "confidential", + false, + false, + new String[] {"confidential"}, + 1); } @Test @DisplayName("Should perform case-insensitive search") void performCaseInsensitiveSearch() throws IOException { - testTextFinding("This document contains CONFIDENTIAL information.", - "confidential", false, false, - new String[]{"CONFIDENTIAL"}, 1); + testTextFinding( + "This document contains CONFIDENTIAL information.", + "confidential", + false, + false, + new String[] {"CONFIDENTIAL"}, + 1); } @Test @DisplayName("Should find multiple occurrences of same term") void findMultipleOccurrences() throws IOException { - testTextFinding("The secret code is secret123. Keep this secret safe!", - "secret", false, false, - new String[]{"secret", "secret", "secret"}, 3); + testTextFinding( + "The secret code is secret123. Keep this secret safe!", + "secret", + false, + false, + new String[] {"secret", "secret", "secret"}, + 3); } @Test @@ -131,33 +155,49 @@ class TextFinderTest { @Test @DisplayName("Should find only whole words when enabled") void findOnlyWholeWords() throws IOException { - testTextFinding("This is a test testing document with tested results.", - "test", false, true, - new String[]{"test"}, 1); + testTextFinding( + "This is a test testing document with tested results.", + "test", + false, + true, + new String[] {"test"}, + 1); } @Test @DisplayName("Should find partial matches when whole word search disabled") void findPartialMatches() throws IOException { - testTextFinding("This is a test testing document with tested results.", - "test", false, false, - new String[]{"test", "test", "test"}, 3); + testTextFinding( + "This is a test testing document with tested results.", + "test", + false, + false, + new String[] {"test", "test", "test"}, + 3); } @Test @DisplayName("Should handle punctuation boundaries correctly") void handlePunctuationBoundaries() throws IOException { - testTextFinding("Hello, world! Testing: test-case (test).", - "test", false, true, - new String[]{"test"}, 2); // Both standalone "test" and "test" in "test-case" + testTextFinding( + "Hello, world! Testing: test-case (test).", + "test", + false, + true, + new String[] {"test"}, + 2); // Both standalone "test" and "test" in "test-case" } @Test @DisplayName("Should handle word boundaries with special characters") void handleSpecialCharacterBoundaries() throws IOException { - testTextFinding("Email: test@example.com and test.txt file", - "test", false, true, - new String[]{"test"}, 2); // Both in email and filename should match + testTextFinding( + "Email: test@example.com and test.txt file", + "test", + false, + true, + new String[] {"test"}, + 2); // Both in email and filename should match } } @@ -168,46 +208,64 @@ class TextFinderTest { @Test @DisplayName("Should find text matching regex pattern") void findTextMatchingRegex() throws IOException { - testTextFinding("Contact John at 123-45-6789 or Jane at 987-65-4321 for details.", - "\\d{3}-\\d{2}-\\d{4}", true, false, - new String[]{"123-45-6789", "987-65-4321"}, 2); + testTextFinding( + "Contact John at 123-45-6789 or Jane at 987-65-4321 for details.", + "\\d{3}-\\d{2}-\\d{4}", + true, + false, + new String[] {"123-45-6789", "987-65-4321"}, + 2); } @Test @DisplayName("Should find email addresses with regex") void findEmailAddresses() throws IOException { - testTextFinding("Email: test@example.com and admin@test.org", - "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", true, false, - new String[]{"test@example.com", "admin@test.org"}, 2); + testTextFinding( + "Email: test@example.com and admin@test.org", + "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", + true, + false, + new String[] {"test@example.com", "admin@test.org"}, + 2); } @Test @DisplayName("Should combine regex with whole word search") void combineRegexWithWholeWord() throws IOException { - testTextFinding("Email: test@example.com and admin@test.org", - "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", true, true, - new String[]{"test@example.com", "admin@test.org"}, 2); + testTextFinding( + "Email: test@example.com and admin@test.org", + "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", + true, + true, + new String[] {"test@example.com", "admin@test.org"}, + 2); } @Test @DisplayName("Should find currency patterns") void findCurrencyPatterns() throws IOException { - testTextFinding("Price: $100.50 and €75.25", - "\\$\\d+\\.\\d{2}", true, false, - new String[]{"$100.50"}, 1); + testTextFinding( + "Price: $100.50 and €75.25", + "\\$\\d+\\.\\d{2}", + true, + false, + new String[] {"$100.50"}, + 1); } @ParameterizedTest - @ValueSource(strings = { - "\\d{4}-\\d{2}-\\d{2}", // Date pattern - "\\b[A-Z]{2,}\\b", // Uppercase words - "\\w+@\\w+\\.\\w+", // Simple email pattern - "\\$\\d+", // Simple currency - "\\b\\d{3,4}\\b" // 3-4 digit numbers - }) + @ValueSource( + strings = { + "\\d{4}-\\d{2}-\\d{2}", // Date pattern + "\\b[A-Z]{2,}\\b", // Uppercase words + "\\w+@\\w+\\.\\w+", // Simple email pattern + "\\$\\d+", // Simple currency + "\\b\\d{3,4}\\b" // 3-4 digit numbers + }) @DisplayName("Should handle various regex patterns") void handleVariousRegexPatterns(String regexPattern) throws IOException { - String testContent = "Date: 2023-12-25, Email: test@domain.com, Price: $250, Code: ABC123, Number: 1234"; + String testContent = + "Date: 2023-12-25, Email: test@domain.com, Price: $250, Code: ABC123, Number: 1234"; addTextToPage(testContent); TextFinder textFinder = new TextFinder(regexPattern, true, false); @@ -215,7 +273,9 @@ class TextFinderTest { List foundTexts = textFinder.getFoundTexts(); // Each pattern should find at least one match in our test content - assertFalse(foundTexts.isEmpty(), String.format("Pattern '%s' should find at least one match", regexPattern)); + assertFalse( + foundTexts.isEmpty(), + String.format("Pattern '%s' should find at least one match", regexPattern)); } @Test @@ -230,9 +290,10 @@ class TextFinderTest { assertNotNull(foundTexts); } catch (java.util.regex.PatternSyntaxException e) { assertNotNull(e.getMessage()); - assertTrue(e.getMessage().contains("Unclosed character class") || - e.getMessage().contains("syntax"), - "Exception should indicate regex syntax error"); + assertTrue( + e.getMessage().contains("Unclosed character class") + || e.getMessage().contains("syntax"), + "Exception should indicate regex syntax error"); } catch (RuntimeException | IOException e) { assertNotNull(e.getMessage()); } @@ -246,33 +307,38 @@ class TextFinderTest { @Test @DisplayName("Should handle international characters") void handleInternationalCharacters() throws IOException { - testTextFinding("Hello café naïve résumé", - "café", false, false, - new String[]{"café"}, 1); + testTextFinding( + "Hello café naïve résumé", "café", false, false, new String[] {"café"}, 1); } @Test @DisplayName("Should find text with accented characters") void findAccentedCharacters() throws IOException { - testTextFinding("Café, naïve, résumé, piñata", - "café", false, false, - new String[]{"Café"}, 1); // Case insensitive + testTextFinding( + "Café, naïve, résumé, piñata", + "café", + false, + false, + new String[] {"Café"}, + 1); // Case insensitive } @Test @DisplayName("Should handle special symbols") void handleSpecialSymbols() throws IOException { - testTextFinding("Symbols: © ® ™ ± × ÷ § ¶", - "©", false, false, - new String[]{"©"}, 1); + testTextFinding("Symbols: © ® ™ ± × ÷ § ¶", "©", false, false, new String[] {"©"}, 1); } @Test @DisplayName("Should find currency symbols") void findCurrencySymbols() throws IOException { - testTextFinding("Prices: $100 €75 £50 ¥1000", - "[€£¥]", true, false, - new String[]{"€", "£", "¥"}, 3); + testTextFinding( + "Prices: $100 €75 £50 ¥1000", + "[€£¥]", + true, + false, + new String[] {"€", "£", "¥"}, + 3); } } @@ -330,7 +396,7 @@ class TextFinderTest { String longTerm = "a".repeat(1000); String content = "Short text with " + longTerm + " embedded."; - testTextFinding(content, longTerm, false, false, new String[]{longTerm}, 1); + testTextFinding(content, longTerm, false, false, new String[] {longTerm}, 1); } @Test @@ -350,8 +416,9 @@ class TextFinderTest { long endTime = System.currentTimeMillis(); assertEquals(10, foundTexts.size()); - assertTrue(endTime - startTime < 3000, - "Multi-page search should complete within 3 seconds"); + assertTrue( + endTime - startTime < 3000, + "Multi-page search should complete within 3 seconds"); } } @@ -402,12 +469,13 @@ class TextFinderTest { String complexRegex = "(?=.*\\d)(?=.*[a-z])(?=.*[A-Z])[a-zA-Z\\d]{6}"; - assertDoesNotThrow(() -> { - TextFinder textFinder = new TextFinder(complexRegex, true, false); - textFinder.getText(document); - List foundTexts = textFinder.getFoundTexts(); - assertNotNull(foundTexts); - }); + assertDoesNotThrow( + () -> { + TextFinder textFinder = new TextFinder(complexRegex, true, false); + textFinder.getText(document); + List foundTexts = textFinder.getFoundTexts(); + assertNotNull(foundTexts); + }); } @ParameterizedTest @@ -464,10 +532,11 @@ class TextFinderTest { List foundTexts = textFinder.getFoundTexts(); assertFalse(foundTexts.isEmpty()); - foundTexts.forEach(text -> { - assertNotNull(text.getText()); - assertTrue(text.getX1() >= 0 && text.getY1() >= 0); - }); + foundTexts.forEach( + text -> { + assertNotNull(text.getText()); + assertTrue(text.getX1() >= 0 && text.getY1() >= 0); + }); } } @@ -485,8 +554,10 @@ class TextFinderTest { textFinder.getText(document); List foundTexts = textFinder.getFoundTexts(); - assertEquals(1, foundTexts.size(), - "Should find exactly one standalone '1', not the ones embedded in other numbers/codes"); + assertEquals( + 1, + foundTexts.size(), + "Should find exactly one standalone '1', not the ones embedded in other numbers/codes"); assertEquals("1", foundTexts.get(0).getText()); } @@ -500,14 +571,16 @@ class TextFinderTest { textFinder.getText(document); List foundTexts = textFinder.getFoundTexts(); - assertTrue(foundTexts.size() >= 3, - "Should find multiple instances of '1' including standalone, in '1234', and in 'A1B'"); + assertTrue( + foundTexts.size() >= 3, + "Should find multiple instances of '1' including standalone, in '1234', and in 'A1B'"); } @Test @DisplayName("Should find single characters in various contexts") void findSingleCharacters() throws IOException { - String content = "Grade: A. Section B has item A-1. The letter A appears multiple times."; + String content = + "Grade: A. Section B has item A-1. The letter A appears multiple times."; addTextToPage(content); TextFinder textFinder = new TextFinder("A", false, true); @@ -522,24 +595,29 @@ class TextFinderTest { } @Test - @DisplayName("Should handle digits at word boundaries correctly") + @DisplayName("Digits as strict standalone tokens (exclude decimals and suffixes)") void findDigitsAtWordBoundaries() throws IOException { - String content = "Numbers: 1, 2, 3. Code: 123. Version: 1.0. Item1 and Item2."; + String content = + "Numbers: 1, 2, 3. Code: 123. Version: 1.0. Item1 and Item2. Price: 2,50€"; addTextToPage(content); TextFinder textFinder1 = new TextFinder("1", false, true); textFinder1.getText(document); List foundTexts1 = textFinder1.getFoundTexts(); - assertEquals(1, foundTexts1.size(), - "Should find only the standalone '1' at the beginning"); + assertEquals( + 1, + foundTexts1.size(), + "Should find only the standalone '1'; do not count the '1' in '1.0' or in 'Item1'."); TextFinder textFinder2 = new TextFinder("2", false, true); textFinder2.getText(document); List foundTexts2 = textFinder2.getFoundTexts(); - assertEquals(1, foundTexts2.size(), - "Should find only the standalone '2' in the number list"); + assertEquals( + 1, + foundTexts2.size(), + "Should find only the standalone '2' in the number list"); } @Test @@ -566,8 +644,10 @@ class TextFinderTest { textFinder.getText(document); List foundTexts = textFinder.getFoundTexts(); - assertEquals(2, foundTexts.size(), - "Should find both '1' instances despite spacing variations"); + assertEquals( + 2, + foundTexts.size(), + "Should find both '1' instances despite spacing variations"); } }