refactor(tests): move & expand TextFinder/RedactController tests; fix TextFinder empty search-term handling; update token filtering API (#4264)

# Description of Changes

- **What was changed**
  - Relocated and refactored unit tests:
    - `TextFinderTest` and `RedactControllerTest` moved under
      `app/core/src/test/...` to align with the module structure.
    - Expanded test coverage: whole-word vs. partial matches, complex
      regexes (emails, SSNs, IPs, currency), international/accented
      characters, multi-page documents, malformed PDFs, operator preservation,
      color decoding, and performance assertions.
  - **API adjustments in redaction flow**:
    - `createTokensWithoutTargetText(...)` now takes the `PDDocument` as an
      additional first argument, ahead of the `PDPage`, to properly manage
      resources/streams.
    - Tests now call `createPlaceholderWithFont(...)` in place of
      `createPlaceholder(...)`, passing the font explicitly so the placeholder
      maintains text width with explicit font context.
  - **Bug fix in `TextFinder`**:
    - Return early (after calling `super.endPage(page)`) when the trimmed
      search term is empty, to prevent unnecessary processing and avoid
      false positives/errors.
    - Minor cleanup (removed the redundant `super()` call) and improved guard
      logic around regex/whole-word wrapping.

- **Why the change was made**
  - Improve reliability and determinism of PDF redaction and text finding
    by exercising real-world patterns and edge cases.
  - Ensure structural PDF operators (graphics/positioning) are preserved
    during token filtering.
  - Prevent crashes or misleading matches when users provide
    empty/whitespace-only search terms.
  - Align tests with the current project layout and increase
    maintainability.

---

## Checklist

### General

- [x] I have read the [Contribution
Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md)
- [x] I have read the [Stirling-PDF Developer
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md)
(if applicable)
- [ ] I have read the [How to add new languages to
Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md)
(if applicable)
- [x] I have performed a self-review of my own code
- [x] My changes generate no new warnings

### Documentation

- [ ] I have updated relevant docs on [Stirling-PDF's doc
repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/)
(if functionality has heavily changed)
- [ ] I have read the section [Add New Translation
Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md#add-new-translation-tags)
(for new translation tags only)

### UI Changes (if applicable)

- [ ] Screenshots or videos demonstrating the UI changes are attached
(e.g., as comments or direct attachments in the PR)

### Testing (if applicable)

- [ ] I have tested my changes locally. Refer to the [Testing
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md#6-testing)
for more details.
This commit is contained in:
Ludy 2025-08-24 22:20:28 +02:00 committed by GitHub
parent 2baa258e11
commit 9779c75df4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 419 additions and 234 deletions

View File

@ -27,7 +27,6 @@ public class TextFinder extends PDFTextStripper {
public TextFinder(String searchTerm, boolean useRegex, boolean wholeWordSearch)
throws IOException {
super();
this.searchTerm = searchTerm;
this.useRegex = useRegex;
this.wholeWordSearch = wholeWordSearch;
@ -68,11 +67,15 @@ public class TextFinder extends PDFTextStripper {
}
String processedSearchTerm = this.searchTerm.trim();
if (processedSearchTerm.isEmpty()) {
super.endPage(page);
return;
}
String regex = this.useRegex ? processedSearchTerm : "\\Q" + processedSearchTerm + "\\E";
if (this.wholeWordSearch) {
if (processedSearchTerm.length() == 1
&& Character.isDigit(processedSearchTerm.charAt(0))) {
regex = "(?<![\\w])" + regex + "(?![\\w])";
regex = "(?<![\\w])(?<!\\d[\\.,])" + regex + "(?![\\w])(?![\\.,]\\d)";
} else if (processedSearchTerm.length() == 1) {
regex = "(?<![\\w])" + regex + "(?![\\w])";
} else {

View File

@ -57,11 +57,9 @@ class RedactControllerTest {
private static final Logger log = LoggerFactory.getLogger(RedactControllerTest.class);
@Mock
private CustomPDFDocumentFactory pdfDocumentFactory;
@Mock private CustomPDFDocumentFactory pdfDocumentFactory;
@InjectMocks
private RedactController redactController;
@InjectMocks private RedactController redactController;
private MockMultipartFile mockPdfFile;
private PDDocument mockDocument;
@ -72,9 +70,15 @@ class RedactControllerTest {
private PDPage realPage;
// Helpers
private void testAutoRedaction(String searchText, boolean useRegex, boolean wholeWordSearch,
String redactColor, float padding, boolean convertToImage,
boolean expectSuccess) throws Exception {
private void testAutoRedaction(
String searchText,
boolean useRegex,
boolean wholeWordSearch,
String redactColor,
float padding,
boolean convertToImage,
boolean expectSuccess)
throws Exception {
RedactPdfRequest request = createRedactPdfRequest();
request.setListOfText(searchText);
request.setUseRegex(useRegex);
@ -103,7 +107,8 @@ class RedactControllerTest {
}
}
private void testManualRedaction(List<RedactionArea> redactionAreas, boolean convertToImage) throws Exception {
private void testManualRedaction(List<RedactionArea> redactionAreas, boolean convertToImage)
throws Exception {
ManualRedactPdfRequest request = createManualRedactPdfRequest();
request.setRedactions(redactionAreas);
request.setConvertPDFToImage(convertToImage);
@ -123,18 +128,16 @@ class RedactControllerTest {
@BeforeEach
void setUp() throws IOException {
mockPdfFile = new MockMultipartFile(
"fileInput",
"test.pdf",
"application/pdf",
createSimplePdfContent()
);
mockPdfFile =
new MockMultipartFile(
"fileInput", "test.pdf", "application/pdf", createSimplePdfContent());
// Mock PDF document and related objects
mockDocument = mock(PDDocument.class);
mockPages = mock(PDPageTree.class);
mockPage = mock(PDPage.class);
org.apache.pdfbox.pdmodel.PDDocumentCatalog mockCatalog = mock(org.apache.pdfbox.pdmodel.PDDocumentCatalog.class);
org.apache.pdfbox.pdmodel.PDDocumentCatalog mockCatalog =
mock(org.apache.pdfbox.pdmodel.PDDocumentCatalog.class);
// Setup document structure properly
when(pdfDocumentFactory.load(any(MockMultipartFile.class))).thenReturn(mockDocument);
@ -153,12 +156,14 @@ class RedactControllerTest {
when(mockPage.getMediaBox()).thenReturn(pageRect);
when(mockPage.getBBox()).thenReturn(pageRect);
InputStream mockInputStream = new ByteArrayInputStream("BT /F1 12 Tf 100 200 Td (test content) Tj ET".getBytes());
InputStream mockInputStream =
new ByteArrayInputStream("BT /F1 12 Tf 100 200 Td (test content) Tj ET".getBytes());
when(mockPage.getContents()).thenReturn(mockInputStream);
when(mockPage.hasContents()).thenReturn(true);
org.apache.pdfbox.cos.COSDocument mockCOSDocument = mock(org.apache.pdfbox.cos.COSDocument.class);
org.apache.pdfbox.cos.COSDocument mockCOSDocument =
mock(org.apache.pdfbox.cos.COSDocument.class);
org.apache.pdfbox.cos.COSStream mockCOSStream = mock(org.apache.pdfbox.cos.COSStream.class);
when(mockDocument.getDocument()).thenReturn(mockCOSDocument);
when(mockCOSDocument.createCOSStream()).thenReturn(mockCOSStream);
@ -167,11 +172,14 @@ class RedactControllerTest {
when(mockCOSStream.createOutputStream()).thenReturn(mockOutputStream);
when(mockCOSStream.createOutputStream(any())).thenReturn(mockOutputStream);
doAnswer(invocation -> {
ByteArrayOutputStream baos = invocation.getArgument(0);
baos.write("Mock PDF Content".getBytes());
return null;
}).when(mockDocument).save(any(ByteArrayOutputStream.class));
doAnswer(
invocation -> {
ByteArrayOutputStream baos = invocation.getArgument(0);
baos.write("Mock PDF Content".getBytes());
return null;
})
.when(mockDocument)
.save(any(ByteArrayOutputStream.class));
doNothing().when(mockDocument).close();
// Initialize a real document for unit tests
@ -185,7 +193,8 @@ class RedactControllerTest {
// Set up basic page resources
PDResources resources = new PDResources();
resources.put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA));
resources.put(
COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA));
realPage.setResources(resources);
}
@ -222,7 +231,14 @@ class RedactControllerTest {
@Test
@DisplayName("Should redact multiple search terms")
void redactMultipleSearchTerms() throws Exception {
testAutoRedaction("confidential\nsecret\nprivate\nclassified", false, true, "#FF0000", 2.0f, false, true);
testAutoRedaction(
"confidential\nsecret\nprivate\nclassified",
false,
true,
"#FF0000",
2.0f,
false,
true);
}
@Test
@ -250,8 +266,12 @@ class RedactControllerTest {
when(page.getBBox()).thenReturn(pageRect);
when(page.hasContents()).thenReturn(true);
InputStream mockInputStream = new ByteArrayInputStream(
("BT /F1 12 Tf 100 200 Td (page " + i + " content with confidential info) Tj ET").getBytes());
InputStream mockInputStream =
new ByteArrayInputStream(
("BT /F1 12 Tf 100 200 Td (page "
+ i
+ " content with confidential info) Tj ET")
.getBytes());
when(page.getContents()).thenReturn(mockInputStream);
pageList.add(page);
@ -285,7 +305,8 @@ class RedactControllerTest {
when(mockPages.get(0)).thenReturn(mockPage);
org.apache.pdfbox.pdmodel.PDDocumentInformation mockInfo = mock(org.apache.pdfbox.pdmodel.PDDocumentInformation.class);
org.apache.pdfbox.pdmodel.PDDocumentInformation mockInfo =
mock(org.apache.pdfbox.pdmodel.PDDocumentInformation.class);
when(mockDocument.getDocumentInformation()).thenReturn(mockInfo);
ResponseEntity<byte[]> response = redactController.redactPdf(request);
@ -311,23 +332,27 @@ class RedactControllerTest {
@Test
@DisplayName("Should handle email pattern redaction")
void handleEmailPatternRedaction() throws Exception {
testAutoRedaction("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", true, false, "#0000FF", 1.5f, false, true);
testAutoRedaction(
"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}",
true, false, "#0000FF", 1.5f, false, true);
}
@Test
@DisplayName("Should handle phone number patterns")
void handlePhoneNumberPatterns() throws Exception {
testAutoRedaction("\\(\\d{3}\\)\\s*\\d{3}-\\d{4}", true, false, "#FF0000", 1.0f, false, true);
testAutoRedaction(
"\\(\\d{3}\\)\\s*\\d{3}-\\d{4}", true, false, "#FF0000", 1.0f, false, true);
}
@ParameterizedTest
@ValueSource(strings = {
"\\d{3}-\\d{2}-\\d{4}", // SSN pattern
"\\d{4}[\\s-]?\\d{4}[\\s-]?\\d{4}[\\s-]?\\d{4}", // Credit card pattern
"\\b[A-Z]{2,}\\b", // Uppercase words
"\\$\\d+\\.\\d{2}", // Currency pattern
"\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\b" // IP address pattern
})
@ValueSource(
strings = {
"\\d{3}-\\d{2}-\\d{4}", // SSN pattern
"\\d{4}[\\s-]?\\d{4}[\\s-]?\\d{4}[\\s-]?\\d{4}", // Credit card pattern
"\\b[A-Z]{2,}\\b", // Uppercase words
"\\$\\d+\\.\\d{2}", // Currency pattern
"\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\b" // IP address pattern
})
@DisplayName("Should handle various regex patterns")
void handleVariousRegexPatterns(String regexPattern) throws Exception {
testAutoRedaction(regexPattern, true, false, "#000000", 1.0f, false, true);
@ -519,8 +544,10 @@ class RedactControllerTest {
when(page.getBBox()).thenReturn(pageRect);
when(page.hasContents()).thenReturn(true);
InputStream mockInputStream = new ByteArrayInputStream(
("BT /F1 12 Tf 100 200 Td (page " + i + " content) Tj ET").getBytes());
InputStream mockInputStream =
new ByteArrayInputStream(
("BT /F1 12 Tf 100 200 Td (page " + i + " content) Tj ET")
.getBytes());
when(page.getContents()).thenReturn(mockInputStream);
pageList.add(page);
@ -588,36 +615,38 @@ class RedactControllerTest {
request.setFileInput(null);
request.setListOfText("test");
assertDoesNotThrow(() -> {
try {
redactController.redactPdf(request);
} catch (Exception e) {
assertNotNull(e);
}
});
assertDoesNotThrow(
() -> {
try {
redactController.redactPdf(request);
} catch (Exception e) {
assertNotNull(e);
}
});
}
@Test
@DisplayName("Should handle malformed PDF gracefully")
void handleMalformedPdfGracefully() throws Exception {
MockMultipartFile malformedFile = new MockMultipartFile(
"fileInput",
"malformed.pdf",
"application/pdf",
"Not a real PDF content".getBytes()
);
MockMultipartFile malformedFile =
new MockMultipartFile(
"fileInput",
"malformed.pdf",
"application/pdf",
"Not a real PDF content".getBytes());
RedactPdfRequest request = new RedactPdfRequest();
request.setFileInput(malformedFile);
request.setListOfText("test");
assertDoesNotThrow(() -> {
try {
redactController.redactPdf(request);
} catch (Exception e) {
assertNotNull(e);
}
});
assertDoesNotThrow(
() -> {
try {
redactController.redactPdf(request);
} catch (Exception e) {
assertNotNull(e);
}
});
}
@Test
@ -723,14 +752,24 @@ class RedactControllerTest {
}
@ParameterizedTest
@ValueSource(strings = {"#FF0000", "#00FF00", "#0000FF", "#FFFFFF", "#000000", "FF0000", "00FF00", "0000FF"})
@ValueSource(
strings = {
"#FF0000", "#00FF00", "#0000FF", "#FFFFFF", "#000000", "FF0000", "00FF00",
"0000FF"
})
@DisplayName("Should handle various valid color formats")
void handleVariousValidColorFormats(String colorInput) throws Exception {
Color result = redactController.decodeOrDefault(colorInput);
assertNotNull(result);
assertTrue(result.getRed() >= 0 && result.getRed() <= 255, "Red component should be in valid range");
assertTrue(result.getGreen() >= 0 && result.getGreen() <= 255, "Green component should be in valid range");
assertTrue(result.getBlue() >= 0 && result.getBlue() <= 255, "Blue component should be in valid range");
assertTrue(
result.getRed() >= 0 && result.getRed() <= 255,
"Red component should be in valid range");
assertTrue(
result.getGreen() >= 0 && result.getGreen() <= 255,
"Green component should be in valid range");
assertTrue(
result.getBlue() >= 0 && result.getBlue() <= 255,
"Blue component should be in valid range");
}
@Test
@ -755,16 +794,18 @@ class RedactControllerTest {
Set<String> targetWords = Set.of("confidential");
List<Object> tokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, false);
List<Object> tokens =
redactController.createTokensWithoutTargetText(
realDocument, realPage, targetWords, false, false);
assertNotNull(tokens);
assertFalse(tokens.isEmpty());
String reconstructedText = extractTextFromTokens(tokens);
assertFalse(reconstructedText.contains("confidential"),
"Target text should be replaced with placeholder");
assertTrue(reconstructedText.contains("document"),
"Non-target text should remain");
assertFalse(
reconstructedText.contains("confidential"),
"Target text should be replaced with placeholder");
assertTrue(reconstructedText.contains("document"), "Non-target text should remain");
}
@Test
@ -774,7 +815,9 @@ class RedactControllerTest {
Set<String> targetWords = Set.of("secret");
List<Object> tokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, false);
List<Object> tokens =
redactController.createTokensWithoutTargetText(
realDocument, realPage, targetWords, false, false);
assertNotNull(tokens);
@ -785,7 +828,9 @@ class RedactControllerTest {
if (array.getObject(i) instanceof COSString cosString) {
String text = cosString.getString();
if (text.contains("secret")) {
fail("Target text 'secret' should have been redacted from TJ array");
fail(
"Target text 'secret' should have been redacted from TJ"
+ " array");
}
foundModifiedTJArray = true;
}
@ -803,21 +848,33 @@ class RedactControllerTest {
Set<String> targetWords = Set.of("redact");
List<Object> originalTokens = getOriginalTokens();
List<Object> filteredTokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, false);
List<Object> filteredTokens =
redactController.createTokensWithoutTargetText(
realDocument, realPage, targetWords, false, false);
long originalNonTextCount = originalTokens.stream()
.filter(token -> token instanceof Operator op && !redactController.isTextShowingOperator(op.getName()))
.count();
long originalNonTextCount =
originalTokens.stream()
.filter(
token ->
token instanceof Operator op
&& !redactController.isTextShowingOperator(
op.getName()))
.count();
long filteredNonTextCount = filteredTokens.stream()
.filter(token -> token instanceof Operator op && !redactController.isTextShowingOperator(op.getName()))
.count();
long filteredNonTextCount =
filteredTokens.stream()
.filter(
token ->
token instanceof Operator op
&& !redactController.isTextShowingOperator(
op.getName()))
.count();
assertTrue(filteredNonTextCount > 0,
"Non-text operators should be preserved");
assertTrue(filteredNonTextCount > 0, "Non-text operators should be preserved");
assertTrue(filteredNonTextCount >= originalNonTextCount / 2,
"A reasonable number of non-text operators should be preserved");
assertTrue(
filteredNonTextCount >= originalNonTextCount / 2,
"A reasonable number of non-text operators should be preserved");
}
@Test
@ -827,7 +884,9 @@ class RedactControllerTest {
Set<String> targetWords = Set.of("\\d{3}-\\d{2}-\\d{4}"); // SSN pattern
List<Object> tokens = redactController.createTokensWithoutTargetText(realPage, targetWords, true, false);
List<Object> tokens =
redactController.createTokensWithoutTargetText(
realDocument, realPage, targetWords, true, false);
String reconstructedText = extractTextFromTokens(tokens);
assertFalse(reconstructedText.contains("111-22-3333"), "SSN should be redacted");
@ -841,7 +900,9 @@ class RedactControllerTest {
Set<String> targetWords = Set.of("test");
List<Object> tokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, true);
List<Object> tokens =
redactController.createTokensWithoutTargetText(
realDocument, realPage, targetWords, false, true);
String reconstructedText = extractTextFromTokens(tokens);
assertTrue(reconstructedText.contains("testing"), "Partial matches should remain");
@ -856,11 +917,14 @@ class RedactControllerTest {
Set<String> targetWords = Set.of("sensitive");
List<Object> tokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, false);
List<Object> tokens =
redactController.createTokensWithoutTargetText(
realDocument, realPage, targetWords, false, false);
String reconstructedText = extractTextFromTokens(tokens);
assertFalse(reconstructedText.contains("sensitive"),
"Text should be redacted regardless of operator type");
assertFalse(
reconstructedText.contains("sensitive"),
"Text should be redacted regardless of operator type");
}
@Test
@ -884,7 +948,10 @@ class RedactControllerTest {
void shouldHandleEmptyTokenList() throws Exception {
List<Object> emptyTokens = Collections.emptyList();
assertDoesNotThrow(() -> redactController.writeFilteredContentStream(realDocument, realPage, emptyTokens));
assertDoesNotThrow(
() ->
redactController.writeFilteredContentStream(
realDocument, realPage, emptyTokens));
assertNotNull(realPage.getContents(), "Page should still have content stream");
}
@ -906,20 +973,27 @@ class RedactControllerTest {
@DisplayName("Placeholder creation should maintain text width")
void shouldCreateWidthMatchingPlaceholder() throws Exception {
String originalText = "confidential";
String placeholder = redactController.createPlaceholder(originalText);
String placeholder =
redactController.createPlaceholderWithFont(
originalText, new PDType1Font(Standard14Fonts.FontName.HELVETICA));
assertEquals(originalText.length(), placeholder.length(),
"Placeholder should maintain character count for width preservation");
assertEquals(
originalText.length(),
placeholder.length(),
"Placeholder should maintain character count for width preservation");
}
@Test
@DisplayName("Placeholder should handle special characters")
void shouldHandleSpecialCharactersInPlaceholder() throws Exception {
String originalText = "café naïve";
String placeholder = redactController.createPlaceholder(originalText);
String placeholder =
redactController.createPlaceholderWithFont(
originalText, new PDType1Font(Standard14Fonts.FontName.HELVETICA));
assertEquals(originalText.length(), placeholder.length());
assertFalse(placeholder.contains("café"), "Placeholder should not contain original text");
assertFalse(
placeholder.contains("café"), "Placeholder should not contain original text");
}
@Test
@ -929,7 +1003,9 @@ class RedactControllerTest {
Set<String> targetWords = Set.of("secret");
List<Object> filteredTokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, false);
List<Object> filteredTokens =
redactController.createTokensWithoutTargetText(
realDocument, realPage, targetWords, false, false);
redactController.writeFilteredContentStream(realDocument, realPage, filteredTokens);
assertNotNull(realPage.getContents());
@ -946,15 +1022,21 @@ class RedactControllerTest {
Set<String> targetWords = Set.of("confidential");
List<Object> filteredTokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, false);
List<Object> filteredTokens =
redactController.createTokensWithoutTargetText(
realDocument, realPage, targetWords, false, false);
long filteredPositioning = filteredTokens.stream()
.filter(token -> token instanceof Operator op &&
(op.getName().equals("Td") || op.getName().equals("TD") || op.getName().equals("Tm")))
.count();
long filteredPositioning =
filteredTokens.stream()
.filter(
token ->
token instanceof Operator op
&& ("Td".equals(op.getName())
|| "TD".equals(op.getName())
|| "Tm".equals(op.getName())))
.count();
assertTrue(filteredPositioning > 0,
"Positioning operators should be preserved");
assertTrue(filteredPositioning > 0, "Positioning operators should be preserved");
}
@Test
@ -966,16 +1048,21 @@ class RedactControllerTest {
}
realDocument.addPage(realPage);
realPage.setResources(new PDResources());
realPage.getResources().put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA));
realPage.getResources()
.put(
COSName.getPDFName("F1"),
new PDType1Font(Standard14Fonts.FontName.HELVETICA));
try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, realPage)) {
try (PDPageContentStream contentStream =
new PDPageContentStream(realDocument, realPage)) {
contentStream.setLineWidth(2);
contentStream.moveTo(100, 100);
contentStream.lineTo(200, 200);
contentStream.stroke();
contentStream.beginText();
contentStream.setFont(realPage.getResources().getFont(COSName.getPDFName("F1")), 12);
contentStream.setFont(
realPage.getResources().getFont(COSName.getPDFName("F1")), 12);
contentStream.newLineAtOffset(50, 750);
contentStream.showText("This is a complex document with ");
contentStream.setTextRise(5);
@ -990,19 +1077,27 @@ class RedactControllerTest {
Set<String> targetWords = Set.of("confidential");
List<Object> tokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, false);
List<Object> tokens =
redactController.createTokensWithoutTargetText(
realDocument, realPage, targetWords, false, false);
assertNotNull(tokens);
assertFalse(tokens.isEmpty());
String reconstructedText = extractTextFromTokens(tokens);
assertFalse(reconstructedText.contains("confidential"), "Target text should be redacted");
assertFalse(
reconstructedText.contains("confidential"), "Target text should be redacted");
boolean hasGraphicsOperators = tokens.stream()
.anyMatch(token -> token instanceof Operator op &&
(op.getName().equals("re") || op.getName().equals("f") ||
op.getName().equals("m") || op.getName().equals("l") ||
op.getName().equals("S")));
boolean hasGraphicsOperators =
tokens.stream()
.anyMatch(
token ->
token instanceof Operator op
&& ("re".equals(op.getName())
|| "f".equals(op.getName())
|| "m".equals(op.getName())
|| "l".equals(op.getName())
|| "S".equals(op.getName())));
assertTrue(hasGraphicsOperators, "Graphics operators should be preserved");
}
@ -1019,10 +1114,12 @@ class RedactControllerTest {
// Create resources
PDResources resources = new PDResources();
resources.put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA));
resources.put(
COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA));
realPage.setResources(resources);
try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, realPage)) {
try (PDPageContentStream contentStream =
new PDPageContentStream(realDocument, realPage)) {
contentStream.beginText();
contentStream.setFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), 12);
contentStream.newLineAtOffset(50, 750);
@ -1180,7 +1277,8 @@ class RedactControllerTest {
}
realDocument.addPage(realPage);
realPage.setResources(new PDResources());
realPage.getResources().put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA));
realPage.getResources()
.put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA));
try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, realPage)) {
contentStream.beginText();
@ -1198,7 +1296,8 @@ class RedactControllerTest {
}
realDocument.addPage(realPage);
realPage.setResources(new PDResources());
realPage.getResources().put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA));
realPage.getResources()
.put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA));
try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, realPage)) {
contentStream.beginText();
@ -1221,7 +1320,8 @@ class RedactControllerTest {
}
realDocument.addPage(realPage);
realPage.setResources(new PDResources());
realPage.getResources().put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA));
realPage.getResources()
.put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA));
try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, realPage)) {
contentStream.setLineWidth(2);
@ -1248,7 +1348,8 @@ class RedactControllerTest {
}
realDocument.addPage(realPage);
realPage.setResources(new PDResources());
realPage.getResources().put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA));
realPage.getResources()
.put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA));
try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, realPage)) {
contentStream.beginText();
@ -1266,28 +1367,29 @@ class RedactControllerTest {
// Helper for token creation
private List<Object> createSampleTokenList() {
return List.of(
Operator.getOperator("BT"),
COSName.getPDFName("F1"),
new COSFloat(12),
Operator.getOperator("Tf"),
new COSString("Sample text"),
Operator.getOperator("Tj"),
Operator.getOperator("ET")
);
Operator.getOperator("BT"),
COSName.getPDFName("F1"),
new COSFloat(12),
Operator.getOperator("Tf"),
new COSString("Sample text"),
Operator.getOperator("Tj"),
Operator.getOperator("ET"));
}
private List<Object> getOriginalTokens() throws Exception {
// Create a new page to avoid side effects from other tests
PDPage pageForTokenExtraction = new PDPage(PDRectangle.A4);
pageForTokenExtraction.setResources(realPage.getResources());
try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, pageForTokenExtraction)) {
contentStream.beginText();
contentStream.setFont(realPage.getResources().getFont(COSName.getPDFName("F1")), 12);
contentStream.newLineAtOffset(50, 750);
contentStream.showText("Original content");
contentStream.endText();
try (PDPageContentStream contentStream =
new PDPageContentStream(realDocument, pageForTokenExtraction)) {
contentStream.beginText();
contentStream.setFont(realPage.getResources().getFont(COSName.getPDFName("F1")), 12);
contentStream.newLineAtOffset(50, 750);
contentStream.showText("Original content");
contentStream.endText();
}
return redactController.createTokensWithoutTargetText(pageForTokenExtraction, Collections.emptySet(), false, false);
return redactController.createTokensWithoutTargetText(
realDocument, pageForTokenExtraction, Collections.emptySet(), false, false);
}
private String extractTextFromTokens(List<Object> tokens) {

View File

@ -1,5 +1,11 @@
package stirling.software.SPDF.pdf;
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.IOException;
import java.util.List;
@ -10,11 +16,6 @@ import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
import org.junit.jupiter.api.AfterEach;
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Nested;
@ -34,33 +35,44 @@ class TextFinderTest {
private PDPage page;
// Helpers
private void testTextFinding(String pageContent, String searchTerm, boolean useRegex, boolean wholeWord,
String[] expectedTexts, int expectedCount) throws IOException {
private void testTextFinding(
String pageContent,
String searchTerm,
boolean useRegex,
boolean wholeWord,
String[] expectedTexts,
int expectedCount)
throws IOException {
addTextToPage(pageContent);
TextFinder textFinder = new TextFinder(searchTerm, useRegex, wholeWord);
textFinder.getText(document);
List<PDFText> foundTexts = textFinder.getFoundTexts();
assertEquals(expectedCount, foundTexts.size(),
String.format("Expected %d matches for search term '%s'", expectedCount, searchTerm));
assertEquals(
expectedCount,
foundTexts.size(),
String.format(
"Expected %d matches for search term '%s'", expectedCount, searchTerm));
if (expectedTexts != null) {
for (String expectedText : expectedTexts) {
assertTrue(foundTexts.stream().anyMatch(text -> text.getText().equals(expectedText)),
String.format("Expected to find text: '%s'", expectedText));
assertTrue(
foundTexts.stream().anyMatch(text -> text.getText().equals(expectedText)),
String.format("Expected to find text: '%s'", expectedText));
}
}
// Verify basic properties of found texts
foundTexts.forEach(text -> {
assertNotNull(text.getText());
assertTrue(text.getX1() >= 0);
assertTrue(text.getY1() >= 0);
assertTrue(text.getX2() >= text.getX1());
assertTrue(text.getY2() >= text.getY1());
assertEquals(0, text.getPageIndex()); // Single page test
});
foundTexts.forEach(
text -> {
assertNotNull(text.getText());
assertTrue(text.getX1() >= 0);
assertTrue(text.getY1() >= 0);
assertTrue(text.getX2() >= text.getX1());
assertTrue(text.getY2() >= text.getY1());
assertEquals(0, text.getPageIndex()); // Single page test
});
}
@BeforeEach
@ -84,25 +96,37 @@ class TextFinderTest {
@Test
@DisplayName("Should find simple text correctly")
void findSimpleText() throws IOException {
testTextFinding("This is a confidential document with secret information.",
"confidential", false, false,
new String[]{"confidential"}, 1);
testTextFinding(
"This is a confidential document with secret information.",
"confidential",
false,
false,
new String[] {"confidential"},
1);
}
@Test
@DisplayName("Should perform case-insensitive search")
void performCaseInsensitiveSearch() throws IOException {
testTextFinding("This document contains CONFIDENTIAL information.",
"confidential", false, false,
new String[]{"CONFIDENTIAL"}, 1);
testTextFinding(
"This document contains CONFIDENTIAL information.",
"confidential",
false,
false,
new String[] {"CONFIDENTIAL"},
1);
}
/**
 * Checks that every occurrence of "secret" is reported: three matches are expected, including
 * the one embedded in "secret123".
 */
@Test
@DisplayName("Should find multiple occurrences of same term")
void findMultipleOccurrences() throws IOException {
    testTextFinding(
            "The secret code is secret123. Keep this secret safe!",
            "secret",
            false,
            false,
            new String[] {"secret", "secret", "secret"},
            3);
}
@Test
@ -131,33 +155,49 @@ class TextFinderTest {
/**
 * Checks that, with the whole-word option on, only the standalone "test" is reported and the
 * occurrences inside "testing" and "tested" are ignored (one match expected).
 */
@Test
@DisplayName("Should find only whole words when enabled")
void findOnlyWholeWords() throws IOException {
    testTextFinding(
            "This is a test testing document with tested results.",
            "test",
            false,
            true,
            new String[] {"test"},
            1);
}
/**
 * Checks that, with the whole-word option off, "test" is also matched inside "testing" and
 * "tested" (three matches expected).
 */
@Test
@DisplayName("Should find partial matches when whole word search disabled")
void findPartialMatches() throws IOException {
    testTextFinding(
            "This is a test testing document with tested results.",
            "test",
            false,
            false,
            new String[] {"test", "test", "test"},
            3);
}
/**
 * Checks whole-word matching next to punctuation: "test" is expected twice, once in
 * "test-case" and once in "(test)".
 */
@Test
@DisplayName("Should handle punctuation boundaries correctly")
void handlePunctuationBoundaries() throws IOException {
    testTextFinding(
            "Hello, world! Testing: test-case (test).",
            "test",
            false,
            true,
            new String[] {"test"},
            2); // Both standalone "test" and "test" in "test-case"
}
/**
 * Checks whole-word matching next to '@' and '.': "test" is expected twice, once at the start
 * of the e-mail address and once at the start of the file name.
 */
@Test
@DisplayName("Should handle word boundaries with special characters")
void handleSpecialCharacterBoundaries() throws IOException {
    testTextFinding(
            "Email: test@example.com and test.txt file",
            "test",
            false,
            true,
            new String[] {"test"},
            2); // Both in email and filename should match
}
}
@ -168,46 +208,64 @@ class TextFinderTest {
/**
 * Checks regex search with a three-two-four digit pattern: both dash-separated numbers in the
 * content are expected as matches.
 */
@Test
@DisplayName("Should find text matching regex pattern")
void findTextMatchingRegex() throws IOException {
    testTextFinding(
            "Contact John at 123-45-6789 or Jane at 987-65-4321 for details.",
            "\\d{3}-\\d{2}-\\d{4}",
            true,
            false,
            new String[] {"123-45-6789", "987-65-4321"},
            2);
}
/**
 * Checks regex search with an e-mail pattern: both addresses in the content are expected as
 * matches.
 */
@Test
@DisplayName("Should find email addresses with regex")
void findEmailAddresses() throws IOException {
    testTextFinding(
            "Email: test@example.com and admin@test.org",
            "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}",
            true,
            false,
            new String[] {"test@example.com", "admin@test.org"},
            2);
}
/**
 * Checks that enabling the whole-word option together with regex search still yields both
 * e-mail addresses from the content.
 */
@Test
@DisplayName("Should combine regex with whole word search")
void combineRegexWithWholeWord() throws IOException {
    testTextFinding(
            "Email: test@example.com and admin@test.org",
            "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}",
            true,
            true,
            new String[] {"test@example.com", "admin@test.org"},
            2);
}
/**
 * Checks regex search for a dollar amount with two decimals: only "$100.50" is expected; the
 * euro amount in the same content must not match.
 */
@Test
@DisplayName("Should find currency patterns")
void findCurrencyPatterns() throws IOException {
    testTextFinding(
            "Price: $100.50 and €75.25",
            "\\$\\d+\\.\\d{2}",
            true,
            false,
            new String[] {"$100.50"},
            1);
}
@ParameterizedTest
@ValueSource(strings = {
"\\d{4}-\\d{2}-\\d{2}", // Date pattern
"\\b[A-Z]{2,}\\b", // Uppercase words
"\\w+@\\w+\\.\\w+", // Simple email pattern
"\\$\\d+", // Simple currency
"\\b\\d{3,4}\\b" // 3-4 digit numbers
})
@ValueSource(
strings = {
"\\d{4}-\\d{2}-\\d{2}", // Date pattern
"\\b[A-Z]{2,}\\b", // Uppercase words
"\\w+@\\w+\\.\\w+", // Simple email pattern
"\\$\\d+", // Simple currency
"\\b\\d{3,4}\\b" // 3-4 digit numbers
})
@DisplayName("Should handle various regex patterns")
void handleVariousRegexPatterns(String regexPattern) throws IOException {
String testContent = "Date: 2023-12-25, Email: test@domain.com, Price: $250, Code: ABC123, Number: 1234";
String testContent =
"Date: 2023-12-25, Email: test@domain.com, Price: $250, Code: ABC123, Number: 1234";
addTextToPage(testContent);
TextFinder textFinder = new TextFinder(regexPattern, true, false);
@ -215,7 +273,9 @@ class TextFinderTest {
List<PDFText> foundTexts = textFinder.getFoundTexts();
// Each pattern should find at least one match in our test content
assertFalse(foundTexts.isEmpty(), String.format("Pattern '%s' should find at least one match", regexPattern));
assertFalse(
foundTexts.isEmpty(),
String.format("Pattern '%s' should find at least one match", regexPattern));
}
@Test
@ -230,9 +290,10 @@ class TextFinderTest {
assertNotNull(foundTexts);
} catch (java.util.regex.PatternSyntaxException e) {
assertNotNull(e.getMessage());
assertTrue(e.getMessage().contains("Unclosed character class") ||
e.getMessage().contains("syntax"),
"Exception should indicate regex syntax error");
assertTrue(
e.getMessage().contains("Unclosed character class")
|| e.getMessage().contains("syntax"),
"Exception should indicate regex syntax error");
} catch (RuntimeException | IOException e) {
assertNotNull(e.getMessage());
}
@ -246,33 +307,38 @@ class TextFinderTest {
/**
 * Checks that a search term containing an accented character ("café") is found exactly once in
 * content made of accented words.
 */
@Test
@DisplayName("Should handle international characters")
void handleInternationalCharacters() throws IOException {
    testTextFinding(
            "Hello café naïve résumé", "café", false, false, new String[] {"café"}, 1);
}
/**
 * Checks that the lower-case term "café" matches the capitalised "Café" in the content; the
 * expected match text keeps the document's casing.
 */
@Test
@DisplayName("Should find text with accented characters")
void findAccentedCharacters() throws IOException {
    testTextFinding(
            "Café, naïve, résumé, piñata",
            "café",
            false,
            false,
            new String[] {"Café"},
            1); // Case insensitive
}
/** Checks that the single symbol "©" is found exactly once among other typographic symbols. */
@Test
@DisplayName("Should handle special symbols")
void handleSpecialSymbols() throws IOException {
    testTextFinding("Symbols: © ® ™ ± × ÷ § ¶", "©", false, false, new String[] {"©"}, 1);
}
/**
 * Checks regex search with the character class {@code [€£¥]}: each of the three symbols in the
 * content is expected as its own single-character match. The dollar sign is not in the class
 * and must not match.
 */
@Test
@DisplayName("Should find currency symbols")
void findCurrencySymbols() throws IOException {
    testTextFinding(
            "Prices: $100 €75 £50 ¥1000",
            "[€£¥]",
            true,
            false,
            // A character class always consumes exactly one character, so an empty string
            // can never be a match; the first expected text is the euro sign.
            new String[] {"€", "£", "¥"},
            3);
}
}
@ -330,7 +396,7 @@ class TextFinderTest {
String longTerm = "a".repeat(1000);
String content = "Short text with " + longTerm + " embedded.";
testTextFinding(content, longTerm, false, false, new String[]{longTerm}, 1);
testTextFinding(content, longTerm, false, false, new String[] {longTerm}, 1);
}
@Test
@ -350,8 +416,9 @@ class TextFinderTest {
long endTime = System.currentTimeMillis();
assertEquals(10, foundTexts.size());
assertTrue(endTime - startTime < 3000,
"Multi-page search should complete within 3 seconds");
assertTrue(
endTime - startTime < 3000,
"Multi-page search should complete within 3 seconds");
}
}
@ -402,12 +469,13 @@ class TextFinderTest {
String complexRegex = "(?=.*\\d)(?=.*[a-z])(?=.*[A-Z])[a-zA-Z\\d]{6}";
assertDoesNotThrow(() -> {
TextFinder textFinder = new TextFinder(complexRegex, true, false);
textFinder.getText(document);
List<PDFText> foundTexts = textFinder.getFoundTexts();
assertNotNull(foundTexts);
});
assertDoesNotThrow(
() -> {
TextFinder textFinder = new TextFinder(complexRegex, true, false);
textFinder.getText(document);
List<PDFText> foundTexts = textFinder.getFoundTexts();
assertNotNull(foundTexts);
});
}
@ParameterizedTest
@ -464,10 +532,11 @@ class TextFinderTest {
List<PDFText> foundTexts = textFinder.getFoundTexts();
assertFalse(foundTexts.isEmpty());
foundTexts.forEach(text -> {
assertNotNull(text.getText());
assertTrue(text.getX1() >= 0 && text.getY1() >= 0);
});
foundTexts.forEach(
text -> {
assertNotNull(text.getText());
assertTrue(text.getX1() >= 0 && text.getY1() >= 0);
});
}
}
@ -485,8 +554,10 @@ class TextFinderTest {
textFinder.getText(document);
List<PDFText> foundTexts = textFinder.getFoundTexts();
assertEquals(1, foundTexts.size(),
"Should find exactly one standalone '1', not the ones embedded in other numbers/codes");
assertEquals(
1,
foundTexts.size(),
"Should find exactly one standalone '1', not the ones embedded in other numbers/codes");
assertEquals("1", foundTexts.get(0).getText());
}
@ -500,14 +571,16 @@ class TextFinderTest {
textFinder.getText(document);
List<PDFText> foundTexts = textFinder.getFoundTexts();
assertTrue(foundTexts.size() >= 3,
"Should find multiple instances of '1' including standalone, in '1234', and in 'A1B'");
assertTrue(
foundTexts.size() >= 3,
"Should find multiple instances of '1' including standalone, in '1234', and in 'A1B'");
}
@Test
@DisplayName("Should find single characters in various contexts")
void findSingleCharacters() throws IOException {
String content = "Grade: A. Section B has item A-1. The letter A appears multiple times.";
String content =
"Grade: A. Section B has item A-1. The letter A appears multiple times.";
addTextToPage(content);
TextFinder textFinder = new TextFinder("A", false, true);
@ -522,24 +595,29 @@ class TextFinderTest {
}
/**
 * Checks whole-word search for single digits. The content holds each digit both as a
 * standalone list item and embedded in other tokens ("123", "1.0", "Item1", "Item2",
 * "2,50€"); exactly one match is expected for "1" and one for "2".
 */
@Test
@DisplayName("Digits as strict standalone tokens (exclude decimals and suffixes)")
void findDigitsAtWordBoundaries() throws IOException {
    String content =
            "Numbers: 1, 2, 3. Code: 123. Version: 1.0. Item1 and Item2. Price: 2,50€";
    addTextToPage(content);

    // Whole-word search for "1".
    TextFinder textFinder1 = new TextFinder("1", false, true);
    textFinder1.getText(document);
    List<PDFText> foundTexts1 = textFinder1.getFoundTexts();
    assertEquals(
            1,
            foundTexts1.size(),
            "Should find only the standalone '1'; do not count the '1' in '1.0' or in 'Item1'.");

    // Whole-word search for "2" against the same document.
    TextFinder textFinder2 = new TextFinder("2", false, true);
    textFinder2.getText(document);
    List<PDFText> foundTexts2 = textFinder2.getFoundTexts();
    assertEquals(
            1,
            foundTexts2.size(),
            "Should find only the standalone '2' in the number list");
}
@Test
@ -566,8 +644,10 @@ class TextFinderTest {
textFinder.getText(document);
List<PDFText> foundTexts = textFinder.getFoundTexts();
assertEquals(2, foundTexts.size(),
"Should find both '1' instances despite spacing variations");
assertEquals(
2,
foundTexts.size(),
"Should find both '1' instances despite spacing variations");
}
}