refactor(tests): move & expand TextFinder/RedactController tests; fix TextFinder empty search-term handling; update token filtering API (#4264)

# Description of Changes

- **What was changed**
  - Relocated and refactored unit tests:
    - `TextFinderTest` and `RedactControllerTest` moved under
      `app/core/src/test/...` to align with module structure.
    - Expanded test coverage: whole-word vs. partial matches, complex
      regexes (emails, SSNs, IPs, currency), international/accented
      characters, multi-page documents, malformed PDFs, operator preservation,
      color decoding, and performance assertions.
  - **API adjustments in redaction flow**:
    - `createTokensWithoutTargetText(...)` now accepts the `PDDocument`
      alongside `PDPage` to properly manage resources/streams.
    - Introduced/used `createPlaceholderWithFont(...)` to maintain text
      width with explicit font context.
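  - **Bug fix in `TextFinder`**:
    - Return early when the (trimmed) search term is empty to prevent
      unnecessary processing and avoid false positives/errors.
    - Minor cleanup (removed redundant `super()` call) and improved guard
      logic around regex/whole-word wrapping: a single-digit whole-word
      search no longer matches a digit that is part of a larger number
      written with a `.` or `,` separator (e.g. the `1` in `1.5` or `1,000`).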

- **Why the change was made**
  - Improve reliability and determinism of PDF redaction and text finding
    by exercising real-world patterns and edge cases.
  - Ensure structural PDF operators (graphics/positioning) are preserved
    during token filtering.
  - Prevent crashes or misleading matches when users provide
    empty/whitespace-only search terms.
  - Align tests with the current project layout and increase
    maintainability.

---

## Checklist

### General

- [x] I have read the [Contribution
Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md)
- [x] I have read the [Stirling-PDF Developer
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md)
(if applicable)
- [ ] I have read the [How to add new languages to
Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md)
(if applicable)
- [x] I have performed a self-review of my own code
- [x] My changes generate no new warnings

### Documentation

- [ ] I have updated relevant docs on [Stirling-PDF's doc
repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/)
(if functionality has heavily changed)
- [ ] I have read the section [Add New Translation
Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/HowToAddNewLanguage.md#add-new-translation-tags)
(for new translation tags only)

### UI Changes (if applicable)

- [ ] Screenshots or videos demonstrating the UI changes are attached
(e.g., as comments or direct attachments in the PR)

### Testing (if applicable)

- [ ] I have tested my changes locally. Refer to the [Testing
Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/devGuide/DeveloperGuide.md#6-testing)
for more details.
This commit is contained in:
Ludy 2025-08-24 22:20:28 +02:00 committed by GitHub
parent 2baa258e11
commit 9779c75df4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 419 additions and 234 deletions

View File

@ -27,7 +27,6 @@ public class TextFinder extends PDFTextStripper {
public TextFinder(String searchTerm, boolean useRegex, boolean wholeWordSearch) public TextFinder(String searchTerm, boolean useRegex, boolean wholeWordSearch)
throws IOException { throws IOException {
super();
this.searchTerm = searchTerm; this.searchTerm = searchTerm;
this.useRegex = useRegex; this.useRegex = useRegex;
this.wholeWordSearch = wholeWordSearch; this.wholeWordSearch = wholeWordSearch;
@ -68,11 +67,15 @@ public class TextFinder extends PDFTextStripper {
} }
String processedSearchTerm = this.searchTerm.trim(); String processedSearchTerm = this.searchTerm.trim();
if (processedSearchTerm.isEmpty()) {
super.endPage(page);
return;
}
String regex = this.useRegex ? processedSearchTerm : "\\Q" + processedSearchTerm + "\\E"; String regex = this.useRegex ? processedSearchTerm : "\\Q" + processedSearchTerm + "\\E";
if (this.wholeWordSearch) { if (this.wholeWordSearch) {
if (processedSearchTerm.length() == 1 if (processedSearchTerm.length() == 1
&& Character.isDigit(processedSearchTerm.charAt(0))) { && Character.isDigit(processedSearchTerm.charAt(0))) {
regex = "(?<![\\w])" + regex + "(?![\\w])"; regex = "(?<![\\w])(?<!\\d[\\.,])" + regex + "(?![\\w])(?![\\.,]\\d)";
} else if (processedSearchTerm.length() == 1) { } else if (processedSearchTerm.length() == 1) {
regex = "(?<![\\w])" + regex + "(?![\\w])"; regex = "(?<![\\w])" + regex + "(?![\\w])";
} else { } else {

View File

@ -57,11 +57,9 @@ class RedactControllerTest {
private static final Logger log = LoggerFactory.getLogger(RedactControllerTest.class); private static final Logger log = LoggerFactory.getLogger(RedactControllerTest.class);
@Mock @Mock private CustomPDFDocumentFactory pdfDocumentFactory;
private CustomPDFDocumentFactory pdfDocumentFactory;
@InjectMocks @InjectMocks private RedactController redactController;
private RedactController redactController;
private MockMultipartFile mockPdfFile; private MockMultipartFile mockPdfFile;
private PDDocument mockDocument; private PDDocument mockDocument;
@ -72,9 +70,15 @@ class RedactControllerTest {
private PDPage realPage; private PDPage realPage;
// Helpers // Helpers
private void testAutoRedaction(String searchText, boolean useRegex, boolean wholeWordSearch, private void testAutoRedaction(
String redactColor, float padding, boolean convertToImage, String searchText,
boolean expectSuccess) throws Exception { boolean useRegex,
boolean wholeWordSearch,
String redactColor,
float padding,
boolean convertToImage,
boolean expectSuccess)
throws Exception {
RedactPdfRequest request = createRedactPdfRequest(); RedactPdfRequest request = createRedactPdfRequest();
request.setListOfText(searchText); request.setListOfText(searchText);
request.setUseRegex(useRegex); request.setUseRegex(useRegex);
@ -103,7 +107,8 @@ class RedactControllerTest {
} }
} }
private void testManualRedaction(List<RedactionArea> redactionAreas, boolean convertToImage) throws Exception { private void testManualRedaction(List<RedactionArea> redactionAreas, boolean convertToImage)
throws Exception {
ManualRedactPdfRequest request = createManualRedactPdfRequest(); ManualRedactPdfRequest request = createManualRedactPdfRequest();
request.setRedactions(redactionAreas); request.setRedactions(redactionAreas);
request.setConvertPDFToImage(convertToImage); request.setConvertPDFToImage(convertToImage);
@ -123,18 +128,16 @@ class RedactControllerTest {
@BeforeEach @BeforeEach
void setUp() throws IOException { void setUp() throws IOException {
mockPdfFile = new MockMultipartFile( mockPdfFile =
"fileInput", new MockMultipartFile(
"test.pdf", "fileInput", "test.pdf", "application/pdf", createSimplePdfContent());
"application/pdf",
createSimplePdfContent()
);
// Mock PDF document and related objects // Mock PDF document and related objects
mockDocument = mock(PDDocument.class); mockDocument = mock(PDDocument.class);
mockPages = mock(PDPageTree.class); mockPages = mock(PDPageTree.class);
mockPage = mock(PDPage.class); mockPage = mock(PDPage.class);
org.apache.pdfbox.pdmodel.PDDocumentCatalog mockCatalog = mock(org.apache.pdfbox.pdmodel.PDDocumentCatalog.class); org.apache.pdfbox.pdmodel.PDDocumentCatalog mockCatalog =
mock(org.apache.pdfbox.pdmodel.PDDocumentCatalog.class);
// Setup document structure properly // Setup document structure properly
when(pdfDocumentFactory.load(any(MockMultipartFile.class))).thenReturn(mockDocument); when(pdfDocumentFactory.load(any(MockMultipartFile.class))).thenReturn(mockDocument);
@ -153,12 +156,14 @@ class RedactControllerTest {
when(mockPage.getMediaBox()).thenReturn(pageRect); when(mockPage.getMediaBox()).thenReturn(pageRect);
when(mockPage.getBBox()).thenReturn(pageRect); when(mockPage.getBBox()).thenReturn(pageRect);
InputStream mockInputStream = new ByteArrayInputStream("BT /F1 12 Tf 100 200 Td (test content) Tj ET".getBytes()); InputStream mockInputStream =
new ByteArrayInputStream("BT /F1 12 Tf 100 200 Td (test content) Tj ET".getBytes());
when(mockPage.getContents()).thenReturn(mockInputStream); when(mockPage.getContents()).thenReturn(mockInputStream);
when(mockPage.hasContents()).thenReturn(true); when(mockPage.hasContents()).thenReturn(true);
org.apache.pdfbox.cos.COSDocument mockCOSDocument = mock(org.apache.pdfbox.cos.COSDocument.class); org.apache.pdfbox.cos.COSDocument mockCOSDocument =
mock(org.apache.pdfbox.cos.COSDocument.class);
org.apache.pdfbox.cos.COSStream mockCOSStream = mock(org.apache.pdfbox.cos.COSStream.class); org.apache.pdfbox.cos.COSStream mockCOSStream = mock(org.apache.pdfbox.cos.COSStream.class);
when(mockDocument.getDocument()).thenReturn(mockCOSDocument); when(mockDocument.getDocument()).thenReturn(mockCOSDocument);
when(mockCOSDocument.createCOSStream()).thenReturn(mockCOSStream); when(mockCOSDocument.createCOSStream()).thenReturn(mockCOSStream);
@ -167,11 +172,14 @@ class RedactControllerTest {
when(mockCOSStream.createOutputStream()).thenReturn(mockOutputStream); when(mockCOSStream.createOutputStream()).thenReturn(mockOutputStream);
when(mockCOSStream.createOutputStream(any())).thenReturn(mockOutputStream); when(mockCOSStream.createOutputStream(any())).thenReturn(mockOutputStream);
doAnswer(invocation -> { doAnswer(
ByteArrayOutputStream baos = invocation.getArgument(0); invocation -> {
baos.write("Mock PDF Content".getBytes()); ByteArrayOutputStream baos = invocation.getArgument(0);
return null; baos.write("Mock PDF Content".getBytes());
}).when(mockDocument).save(any(ByteArrayOutputStream.class)); return null;
})
.when(mockDocument)
.save(any(ByteArrayOutputStream.class));
doNothing().when(mockDocument).close(); doNothing().when(mockDocument).close();
// Initialize a real document for unit tests // Initialize a real document for unit tests
@ -185,7 +193,8 @@ class RedactControllerTest {
// Set up basic page resources // Set up basic page resources
PDResources resources = new PDResources(); PDResources resources = new PDResources();
resources.put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA)); resources.put(
COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA));
realPage.setResources(resources); realPage.setResources(resources);
} }
@ -222,7 +231,14 @@ class RedactControllerTest {
@Test @Test
@DisplayName("Should redact multiple search terms") @DisplayName("Should redact multiple search terms")
void redactMultipleSearchTerms() throws Exception { void redactMultipleSearchTerms() throws Exception {
testAutoRedaction("confidential\nsecret\nprivate\nclassified", false, true, "#FF0000", 2.0f, false, true); testAutoRedaction(
"confidential\nsecret\nprivate\nclassified",
false,
true,
"#FF0000",
2.0f,
false,
true);
} }
@Test @Test
@ -250,8 +266,12 @@ class RedactControllerTest {
when(page.getBBox()).thenReturn(pageRect); when(page.getBBox()).thenReturn(pageRect);
when(page.hasContents()).thenReturn(true); when(page.hasContents()).thenReturn(true);
InputStream mockInputStream = new ByteArrayInputStream( InputStream mockInputStream =
("BT /F1 12 Tf 100 200 Td (page " + i + " content with confidential info) Tj ET").getBytes()); new ByteArrayInputStream(
("BT /F1 12 Tf 100 200 Td (page "
+ i
+ " content with confidential info) Tj ET")
.getBytes());
when(page.getContents()).thenReturn(mockInputStream); when(page.getContents()).thenReturn(mockInputStream);
pageList.add(page); pageList.add(page);
@ -285,7 +305,8 @@ class RedactControllerTest {
when(mockPages.get(0)).thenReturn(mockPage); when(mockPages.get(0)).thenReturn(mockPage);
org.apache.pdfbox.pdmodel.PDDocumentInformation mockInfo = mock(org.apache.pdfbox.pdmodel.PDDocumentInformation.class); org.apache.pdfbox.pdmodel.PDDocumentInformation mockInfo =
mock(org.apache.pdfbox.pdmodel.PDDocumentInformation.class);
when(mockDocument.getDocumentInformation()).thenReturn(mockInfo); when(mockDocument.getDocumentInformation()).thenReturn(mockInfo);
ResponseEntity<byte[]> response = redactController.redactPdf(request); ResponseEntity<byte[]> response = redactController.redactPdf(request);
@ -311,23 +332,27 @@ class RedactControllerTest {
@Test @Test
@DisplayName("Should handle email pattern redaction") @DisplayName("Should handle email pattern redaction")
void handleEmailPatternRedaction() throws Exception { void handleEmailPatternRedaction() throws Exception {
testAutoRedaction("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", true, false, "#0000FF", 1.5f, false, true); testAutoRedaction(
"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}",
true, false, "#0000FF", 1.5f, false, true);
} }
@Test @Test
@DisplayName("Should handle phone number patterns") @DisplayName("Should handle phone number patterns")
void handlePhoneNumberPatterns() throws Exception { void handlePhoneNumberPatterns() throws Exception {
testAutoRedaction("\\(\\d{3}\\)\\s*\\d{3}-\\d{4}", true, false, "#FF0000", 1.0f, false, true); testAutoRedaction(
"\\(\\d{3}\\)\\s*\\d{3}-\\d{4}", true, false, "#FF0000", 1.0f, false, true);
} }
@ParameterizedTest @ParameterizedTest
@ValueSource(strings = { @ValueSource(
"\\d{3}-\\d{2}-\\d{4}", // SSN pattern strings = {
"\\d{4}[\\s-]?\\d{4}[\\s-]?\\d{4}[\\s-]?\\d{4}", // Credit card pattern "\\d{3}-\\d{2}-\\d{4}", // SSN pattern
"\\b[A-Z]{2,}\\b", // Uppercase words "\\d{4}[\\s-]?\\d{4}[\\s-]?\\d{4}[\\s-]?\\d{4}", // Credit card pattern
"\\$\\d+\\.\\d{2}", // Currency pattern "\\b[A-Z]{2,}\\b", // Uppercase words
"\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\b" // IP address pattern "\\$\\d+\\.\\d{2}", // Currency pattern
}) "\\b\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\b" // IP address pattern
})
@DisplayName("Should handle various regex patterns") @DisplayName("Should handle various regex patterns")
void handleVariousRegexPatterns(String regexPattern) throws Exception { void handleVariousRegexPatterns(String regexPattern) throws Exception {
testAutoRedaction(regexPattern, true, false, "#000000", 1.0f, false, true); testAutoRedaction(regexPattern, true, false, "#000000", 1.0f, false, true);
@ -519,8 +544,10 @@ class RedactControllerTest {
when(page.getBBox()).thenReturn(pageRect); when(page.getBBox()).thenReturn(pageRect);
when(page.hasContents()).thenReturn(true); when(page.hasContents()).thenReturn(true);
InputStream mockInputStream = new ByteArrayInputStream( InputStream mockInputStream =
("BT /F1 12 Tf 100 200 Td (page " + i + " content) Tj ET").getBytes()); new ByteArrayInputStream(
("BT /F1 12 Tf 100 200 Td (page " + i + " content) Tj ET")
.getBytes());
when(page.getContents()).thenReturn(mockInputStream); when(page.getContents()).thenReturn(mockInputStream);
pageList.add(page); pageList.add(page);
@ -588,36 +615,38 @@ class RedactControllerTest {
request.setFileInput(null); request.setFileInput(null);
request.setListOfText("test"); request.setListOfText("test");
assertDoesNotThrow(() -> { assertDoesNotThrow(
try { () -> {
redactController.redactPdf(request); try {
} catch (Exception e) { redactController.redactPdf(request);
assertNotNull(e); } catch (Exception e) {
} assertNotNull(e);
}); }
});
} }
@Test @Test
@DisplayName("Should handle malformed PDF gracefully") @DisplayName("Should handle malformed PDF gracefully")
void handleMalformedPdfGracefully() throws Exception { void handleMalformedPdfGracefully() throws Exception {
MockMultipartFile malformedFile = new MockMultipartFile( MockMultipartFile malformedFile =
"fileInput", new MockMultipartFile(
"malformed.pdf", "fileInput",
"application/pdf", "malformed.pdf",
"Not a real PDF content".getBytes() "application/pdf",
); "Not a real PDF content".getBytes());
RedactPdfRequest request = new RedactPdfRequest(); RedactPdfRequest request = new RedactPdfRequest();
request.setFileInput(malformedFile); request.setFileInput(malformedFile);
request.setListOfText("test"); request.setListOfText("test");
assertDoesNotThrow(() -> { assertDoesNotThrow(
try { () -> {
redactController.redactPdf(request); try {
} catch (Exception e) { redactController.redactPdf(request);
assertNotNull(e); } catch (Exception e) {
} assertNotNull(e);
}); }
});
} }
@Test @Test
@ -723,14 +752,24 @@ class RedactControllerTest {
} }
@ParameterizedTest @ParameterizedTest
@ValueSource(strings = {"#FF0000", "#00FF00", "#0000FF", "#FFFFFF", "#000000", "FF0000", "00FF00", "0000FF"}) @ValueSource(
strings = {
"#FF0000", "#00FF00", "#0000FF", "#FFFFFF", "#000000", "FF0000", "00FF00",
"0000FF"
})
@DisplayName("Should handle various valid color formats") @DisplayName("Should handle various valid color formats")
void handleVariousValidColorFormats(String colorInput) throws Exception { void handleVariousValidColorFormats(String colorInput) throws Exception {
Color result = redactController.decodeOrDefault(colorInput); Color result = redactController.decodeOrDefault(colorInput);
assertNotNull(result); assertNotNull(result);
assertTrue(result.getRed() >= 0 && result.getRed() <= 255, "Red component should be in valid range"); assertTrue(
assertTrue(result.getGreen() >= 0 && result.getGreen() <= 255, "Green component should be in valid range"); result.getRed() >= 0 && result.getRed() <= 255,
assertTrue(result.getBlue() >= 0 && result.getBlue() <= 255, "Blue component should be in valid range"); "Red component should be in valid range");
assertTrue(
result.getGreen() >= 0 && result.getGreen() <= 255,
"Green component should be in valid range");
assertTrue(
result.getBlue() >= 0 && result.getBlue() <= 255,
"Blue component should be in valid range");
} }
@Test @Test
@ -755,16 +794,18 @@ class RedactControllerTest {
Set<String> targetWords = Set.of("confidential"); Set<String> targetWords = Set.of("confidential");
List<Object> tokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, false); List<Object> tokens =
redactController.createTokensWithoutTargetText(
realDocument, realPage, targetWords, false, false);
assertNotNull(tokens); assertNotNull(tokens);
assertFalse(tokens.isEmpty()); assertFalse(tokens.isEmpty());
String reconstructedText = extractTextFromTokens(tokens); String reconstructedText = extractTextFromTokens(tokens);
assertFalse(reconstructedText.contains("confidential"), assertFalse(
"Target text should be replaced with placeholder"); reconstructedText.contains("confidential"),
assertTrue(reconstructedText.contains("document"), "Target text should be replaced with placeholder");
"Non-target text should remain"); assertTrue(reconstructedText.contains("document"), "Non-target text should remain");
} }
@Test @Test
@ -774,7 +815,9 @@ class RedactControllerTest {
Set<String> targetWords = Set.of("secret"); Set<String> targetWords = Set.of("secret");
List<Object> tokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, false); List<Object> tokens =
redactController.createTokensWithoutTargetText(
realDocument, realPage, targetWords, false, false);
assertNotNull(tokens); assertNotNull(tokens);
@ -785,7 +828,9 @@ class RedactControllerTest {
if (array.getObject(i) instanceof COSString cosString) { if (array.getObject(i) instanceof COSString cosString) {
String text = cosString.getString(); String text = cosString.getString();
if (text.contains("secret")) { if (text.contains("secret")) {
fail("Target text 'secret' should have been redacted from TJ array"); fail(
"Target text 'secret' should have been redacted from TJ"
+ " array");
} }
foundModifiedTJArray = true; foundModifiedTJArray = true;
} }
@ -803,21 +848,33 @@ class RedactControllerTest {
Set<String> targetWords = Set.of("redact"); Set<String> targetWords = Set.of("redact");
List<Object> originalTokens = getOriginalTokens(); List<Object> originalTokens = getOriginalTokens();
List<Object> filteredTokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, false); List<Object> filteredTokens =
redactController.createTokensWithoutTargetText(
realDocument, realPage, targetWords, false, false);
long originalNonTextCount = originalTokens.stream() long originalNonTextCount =
.filter(token -> token instanceof Operator op && !redactController.isTextShowingOperator(op.getName())) originalTokens.stream()
.count(); .filter(
token ->
token instanceof Operator op
&& !redactController.isTextShowingOperator(
op.getName()))
.count();
long filteredNonTextCount = filteredTokens.stream() long filteredNonTextCount =
.filter(token -> token instanceof Operator op && !redactController.isTextShowingOperator(op.getName())) filteredTokens.stream()
.count(); .filter(
token ->
token instanceof Operator op
&& !redactController.isTextShowingOperator(
op.getName()))
.count();
assertTrue(filteredNonTextCount > 0, assertTrue(filteredNonTextCount > 0, "Non-text operators should be preserved");
"Non-text operators should be preserved");
assertTrue(filteredNonTextCount >= originalNonTextCount / 2, assertTrue(
"A reasonable number of non-text operators should be preserved"); filteredNonTextCount >= originalNonTextCount / 2,
"A reasonable number of non-text operators should be preserved");
} }
@Test @Test
@ -827,7 +884,9 @@ class RedactControllerTest {
Set<String> targetWords = Set.of("\\d{3}-\\d{2}-\\d{4}"); // SSN pattern Set<String> targetWords = Set.of("\\d{3}-\\d{2}-\\d{4}"); // SSN pattern
List<Object> tokens = redactController.createTokensWithoutTargetText(realPage, targetWords, true, false); List<Object> tokens =
redactController.createTokensWithoutTargetText(
realDocument, realPage, targetWords, true, false);
String reconstructedText = extractTextFromTokens(tokens); String reconstructedText = extractTextFromTokens(tokens);
assertFalse(reconstructedText.contains("111-22-3333"), "SSN should be redacted"); assertFalse(reconstructedText.contains("111-22-3333"), "SSN should be redacted");
@ -841,7 +900,9 @@ class RedactControllerTest {
Set<String> targetWords = Set.of("test"); Set<String> targetWords = Set.of("test");
List<Object> tokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, true); List<Object> tokens =
redactController.createTokensWithoutTargetText(
realDocument, realPage, targetWords, false, true);
String reconstructedText = extractTextFromTokens(tokens); String reconstructedText = extractTextFromTokens(tokens);
assertTrue(reconstructedText.contains("testing"), "Partial matches should remain"); assertTrue(reconstructedText.contains("testing"), "Partial matches should remain");
@ -856,11 +917,14 @@ class RedactControllerTest {
Set<String> targetWords = Set.of("sensitive"); Set<String> targetWords = Set.of("sensitive");
List<Object> tokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, false); List<Object> tokens =
redactController.createTokensWithoutTargetText(
realDocument, realPage, targetWords, false, false);
String reconstructedText = extractTextFromTokens(tokens); String reconstructedText = extractTextFromTokens(tokens);
assertFalse(reconstructedText.contains("sensitive"), assertFalse(
"Text should be redacted regardless of operator type"); reconstructedText.contains("sensitive"),
"Text should be redacted regardless of operator type");
} }
@Test @Test
@ -884,7 +948,10 @@ class RedactControllerTest {
void shouldHandleEmptyTokenList() throws Exception { void shouldHandleEmptyTokenList() throws Exception {
List<Object> emptyTokens = Collections.emptyList(); List<Object> emptyTokens = Collections.emptyList();
assertDoesNotThrow(() -> redactController.writeFilteredContentStream(realDocument, realPage, emptyTokens)); assertDoesNotThrow(
() ->
redactController.writeFilteredContentStream(
realDocument, realPage, emptyTokens));
assertNotNull(realPage.getContents(), "Page should still have content stream"); assertNotNull(realPage.getContents(), "Page should still have content stream");
} }
@ -906,20 +973,27 @@ class RedactControllerTest {
@DisplayName("Placeholder creation should maintain text width") @DisplayName("Placeholder creation should maintain text width")
void shouldCreateWidthMatchingPlaceholder() throws Exception { void shouldCreateWidthMatchingPlaceholder() throws Exception {
String originalText = "confidential"; String originalText = "confidential";
String placeholder = redactController.createPlaceholder(originalText); String placeholder =
redactController.createPlaceholderWithFont(
originalText, new PDType1Font(Standard14Fonts.FontName.HELVETICA));
assertEquals(originalText.length(), placeholder.length(), assertEquals(
"Placeholder should maintain character count for width preservation"); originalText.length(),
placeholder.length(),
"Placeholder should maintain character count for width preservation");
} }
@Test @Test
@DisplayName("Placeholder should handle special characters") @DisplayName("Placeholder should handle special characters")
void shouldHandleSpecialCharactersInPlaceholder() throws Exception { void shouldHandleSpecialCharactersInPlaceholder() throws Exception {
String originalText = "café naïve"; String originalText = "café naïve";
String placeholder = redactController.createPlaceholder(originalText); String placeholder =
redactController.createPlaceholderWithFont(
originalText, new PDType1Font(Standard14Fonts.FontName.HELVETICA));
assertEquals(originalText.length(), placeholder.length()); assertEquals(originalText.length(), placeholder.length());
assertFalse(placeholder.contains("café"), "Placeholder should not contain original text"); assertFalse(
placeholder.contains("café"), "Placeholder should not contain original text");
} }
@Test @Test
@ -929,7 +1003,9 @@ class RedactControllerTest {
Set<String> targetWords = Set.of("secret"); Set<String> targetWords = Set.of("secret");
List<Object> filteredTokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, false); List<Object> filteredTokens =
redactController.createTokensWithoutTargetText(
realDocument, realPage, targetWords, false, false);
redactController.writeFilteredContentStream(realDocument, realPage, filteredTokens); redactController.writeFilteredContentStream(realDocument, realPage, filteredTokens);
assertNotNull(realPage.getContents()); assertNotNull(realPage.getContents());
@ -946,15 +1022,21 @@ class RedactControllerTest {
Set<String> targetWords = Set.of("confidential"); Set<String> targetWords = Set.of("confidential");
List<Object> filteredTokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, false); List<Object> filteredTokens =
redactController.createTokensWithoutTargetText(
realDocument, realPage, targetWords, false, false);
long filteredPositioning = filteredTokens.stream() long filteredPositioning =
.filter(token -> token instanceof Operator op && filteredTokens.stream()
(op.getName().equals("Td") || op.getName().equals("TD") || op.getName().equals("Tm"))) .filter(
.count(); token ->
token instanceof Operator op
&& ("Td".equals(op.getName())
|| "TD".equals(op.getName())
|| "Tm".equals(op.getName())))
.count();
assertTrue(filteredPositioning > 0, assertTrue(filteredPositioning > 0, "Positioning operators should be preserved");
"Positioning operators should be preserved");
} }
@Test @Test
@ -966,16 +1048,21 @@ class RedactControllerTest {
} }
realDocument.addPage(realPage); realDocument.addPage(realPage);
realPage.setResources(new PDResources()); realPage.setResources(new PDResources());
realPage.getResources().put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA)); realPage.getResources()
.put(
COSName.getPDFName("F1"),
new PDType1Font(Standard14Fonts.FontName.HELVETICA));
try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, realPage)) { try (PDPageContentStream contentStream =
new PDPageContentStream(realDocument, realPage)) {
contentStream.setLineWidth(2); contentStream.setLineWidth(2);
contentStream.moveTo(100, 100); contentStream.moveTo(100, 100);
contentStream.lineTo(200, 200); contentStream.lineTo(200, 200);
contentStream.stroke(); contentStream.stroke();
contentStream.beginText(); contentStream.beginText();
contentStream.setFont(realPage.getResources().getFont(COSName.getPDFName("F1")), 12); contentStream.setFont(
realPage.getResources().getFont(COSName.getPDFName("F1")), 12);
contentStream.newLineAtOffset(50, 750); contentStream.newLineAtOffset(50, 750);
contentStream.showText("This is a complex document with "); contentStream.showText("This is a complex document with ");
contentStream.setTextRise(5); contentStream.setTextRise(5);
@ -990,19 +1077,27 @@ class RedactControllerTest {
Set<String> targetWords = Set.of("confidential"); Set<String> targetWords = Set.of("confidential");
List<Object> tokens = redactController.createTokensWithoutTargetText(realPage, targetWords, false, false); List<Object> tokens =
redactController.createTokensWithoutTargetText(
realDocument, realPage, targetWords, false, false);
assertNotNull(tokens); assertNotNull(tokens);
assertFalse(tokens.isEmpty()); assertFalse(tokens.isEmpty());
String reconstructedText = extractTextFromTokens(tokens); String reconstructedText = extractTextFromTokens(tokens);
assertFalse(reconstructedText.contains("confidential"), "Target text should be redacted"); assertFalse(
reconstructedText.contains("confidential"), "Target text should be redacted");
boolean hasGraphicsOperators = tokens.stream() boolean hasGraphicsOperators =
.anyMatch(token -> token instanceof Operator op && tokens.stream()
(op.getName().equals("re") || op.getName().equals("f") || .anyMatch(
op.getName().equals("m") || op.getName().equals("l") || token ->
op.getName().equals("S"))); token instanceof Operator op
&& ("re".equals(op.getName())
|| "f".equals(op.getName())
|| "m".equals(op.getName())
|| "l".equals(op.getName())
|| "S".equals(op.getName())));
assertTrue(hasGraphicsOperators, "Graphics operators should be preserved"); assertTrue(hasGraphicsOperators, "Graphics operators should be preserved");
} }
@ -1019,10 +1114,12 @@ class RedactControllerTest {
// Create resources // Create resources
PDResources resources = new PDResources(); PDResources resources = new PDResources();
resources.put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA)); resources.put(
COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA));
realPage.setResources(resources); realPage.setResources(resources);
try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, realPage)) { try (PDPageContentStream contentStream =
new PDPageContentStream(realDocument, realPage)) {
contentStream.beginText(); contentStream.beginText();
contentStream.setFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), 12); contentStream.setFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), 12);
contentStream.newLineAtOffset(50, 750); contentStream.newLineAtOffset(50, 750);
@ -1180,7 +1277,8 @@ class RedactControllerTest {
} }
realDocument.addPage(realPage); realDocument.addPage(realPage);
realPage.setResources(new PDResources()); realPage.setResources(new PDResources());
realPage.getResources().put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA)); realPage.getResources()
.put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA));
try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, realPage)) { try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, realPage)) {
contentStream.beginText(); contentStream.beginText();
@ -1198,7 +1296,8 @@ class RedactControllerTest {
} }
realDocument.addPage(realPage); realDocument.addPage(realPage);
realPage.setResources(new PDResources()); realPage.setResources(new PDResources());
realPage.getResources().put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA)); realPage.getResources()
.put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA));
try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, realPage)) { try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, realPage)) {
contentStream.beginText(); contentStream.beginText();
@ -1221,7 +1320,8 @@ class RedactControllerTest {
} }
realDocument.addPage(realPage); realDocument.addPage(realPage);
realPage.setResources(new PDResources()); realPage.setResources(new PDResources());
realPage.getResources().put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA)); realPage.getResources()
.put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA));
try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, realPage)) { try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, realPage)) {
contentStream.setLineWidth(2); contentStream.setLineWidth(2);
@ -1248,7 +1348,8 @@ class RedactControllerTest {
} }
realDocument.addPage(realPage); realDocument.addPage(realPage);
realPage.setResources(new PDResources()); realPage.setResources(new PDResources());
realPage.getResources().put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA)); realPage.getResources()
.put(COSName.getPDFName("F1"), new PDType1Font(Standard14Fonts.FontName.HELVETICA));
try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, realPage)) { try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, realPage)) {
contentStream.beginText(); contentStream.beginText();
@ -1266,28 +1367,29 @@ class RedactControllerTest {
// Helper for token creation // Helper for token creation
private List<Object> createSampleTokenList() { private List<Object> createSampleTokenList() {
return List.of( return List.of(
Operator.getOperator("BT"), Operator.getOperator("BT"),
COSName.getPDFName("F1"), COSName.getPDFName("F1"),
new COSFloat(12), new COSFloat(12),
Operator.getOperator("Tf"), Operator.getOperator("Tf"),
new COSString("Sample text"), new COSString("Sample text"),
Operator.getOperator("Tj"), Operator.getOperator("Tj"),
Operator.getOperator("ET") Operator.getOperator("ET"));
);
} }
private List<Object> getOriginalTokens() throws Exception { private List<Object> getOriginalTokens() throws Exception {
// Create a new page to avoid side effects from other tests // Create a new page to avoid side effects from other tests
PDPage pageForTokenExtraction = new PDPage(PDRectangle.A4); PDPage pageForTokenExtraction = new PDPage(PDRectangle.A4);
pageForTokenExtraction.setResources(realPage.getResources()); pageForTokenExtraction.setResources(realPage.getResources());
try (PDPageContentStream contentStream = new PDPageContentStream(realDocument, pageForTokenExtraction)) { try (PDPageContentStream contentStream =
contentStream.beginText(); new PDPageContentStream(realDocument, pageForTokenExtraction)) {
contentStream.setFont(realPage.getResources().getFont(COSName.getPDFName("F1")), 12); contentStream.beginText();
contentStream.newLineAtOffset(50, 750); contentStream.setFont(realPage.getResources().getFont(COSName.getPDFName("F1")), 12);
contentStream.showText("Original content"); contentStream.newLineAtOffset(50, 750);
contentStream.endText(); contentStream.showText("Original content");
contentStream.endText();
} }
return redactController.createTokensWithoutTargetText(pageForTokenExtraction, Collections.emptySet(), false, false); return redactController.createTokensWithoutTargetText(
realDocument, pageForTokenExtraction, Collections.emptySet(), false, false);
} }
private String extractTextFromTokens(List<Object> tokens) { private String extractTextFromTokens(List<Object> tokens) {

View File

@ -1,5 +1,11 @@
package stirling.software.SPDF.pdf; package stirling.software.SPDF.pdf;
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.IOException; import java.io.IOException;
import java.util.List; import java.util.List;
@ -10,11 +16,6 @@ import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDType1Font; import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.AfterEach;
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.DisplayName; import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Nested;
@ -34,33 +35,44 @@ class TextFinderTest {
private PDPage page; private PDPage page;
// Helpers // Helpers
private void testTextFinding(String pageContent, String searchTerm, boolean useRegex, boolean wholeWord, private void testTextFinding(
String[] expectedTexts, int expectedCount) throws IOException { String pageContent,
String searchTerm,
boolean useRegex,
boolean wholeWord,
String[] expectedTexts,
int expectedCount)
throws IOException {
addTextToPage(pageContent); addTextToPage(pageContent);
TextFinder textFinder = new TextFinder(searchTerm, useRegex, wholeWord); TextFinder textFinder = new TextFinder(searchTerm, useRegex, wholeWord);
textFinder.getText(document); textFinder.getText(document);
List<PDFText> foundTexts = textFinder.getFoundTexts(); List<PDFText> foundTexts = textFinder.getFoundTexts();
assertEquals(expectedCount, foundTexts.size(), assertEquals(
String.format("Expected %d matches for search term '%s'", expectedCount, searchTerm)); expectedCount,
foundTexts.size(),
String.format(
"Expected %d matches for search term '%s'", expectedCount, searchTerm));
if (expectedTexts != null) { if (expectedTexts != null) {
for (String expectedText : expectedTexts) { for (String expectedText : expectedTexts) {
assertTrue(foundTexts.stream().anyMatch(text -> text.getText().equals(expectedText)), assertTrue(
String.format("Expected to find text: '%s'", expectedText)); foundTexts.stream().anyMatch(text -> text.getText().equals(expectedText)),
String.format("Expected to find text: '%s'", expectedText));
} }
} }
// Verify basic properties of found texts // Verify basic properties of found texts
foundTexts.forEach(text -> { foundTexts.forEach(
assertNotNull(text.getText()); text -> {
assertTrue(text.getX1() >= 0); assertNotNull(text.getText());
assertTrue(text.getY1() >= 0); assertTrue(text.getX1() >= 0);
assertTrue(text.getX2() >= text.getX1()); assertTrue(text.getY1() >= 0);
assertTrue(text.getY2() >= text.getY1()); assertTrue(text.getX2() >= text.getX1());
assertEquals(0, text.getPageIndex()); // Single page test assertTrue(text.getY2() >= text.getY1());
}); assertEquals(0, text.getPageIndex()); // Single page test
});
} }
@BeforeEach @BeforeEach
@ -84,25 +96,37 @@ class TextFinderTest {
@Test @Test
@DisplayName("Should find simple text correctly") @DisplayName("Should find simple text correctly")
void findSimpleText() throws IOException { void findSimpleText() throws IOException {
testTextFinding("This is a confidential document with secret information.", testTextFinding(
"confidential", false, false, "This is a confidential document with secret information.",
new String[]{"confidential"}, 1); "confidential",
false,
false,
new String[] {"confidential"},
1);
} }
@Test @Test
@DisplayName("Should perform case-insensitive search") @DisplayName("Should perform case-insensitive search")
void performCaseInsensitiveSearch() throws IOException { void performCaseInsensitiveSearch() throws IOException {
testTextFinding("This document contains CONFIDENTIAL information.", testTextFinding(
"confidential", false, false, "This document contains CONFIDENTIAL information.",
new String[]{"CONFIDENTIAL"}, 1); "confidential",
false,
false,
new String[] {"CONFIDENTIAL"},
1);
} }
@Test @Test
@DisplayName("Should find multiple occurrences of same term") @DisplayName("Should find multiple occurrences of same term")
void findMultipleOccurrences() throws IOException { void findMultipleOccurrences() throws IOException {
testTextFinding("The secret code is secret123. Keep this secret safe!", testTextFinding(
"secret", false, false, "The secret code is secret123. Keep this secret safe!",
new String[]{"secret", "secret", "secret"}, 3); "secret",
false,
false,
new String[] {"secret", "secret", "secret"},
3);
} }
@Test @Test
@ -131,33 +155,49 @@ class TextFinderTest {
@Test @Test
@DisplayName("Should find only whole words when enabled") @DisplayName("Should find only whole words when enabled")
void findOnlyWholeWords() throws IOException { void findOnlyWholeWords() throws IOException {
testTextFinding("This is a test testing document with tested results.", testTextFinding(
"test", false, true, "This is a test testing document with tested results.",
new String[]{"test"}, 1); "test",
false,
true,
new String[] {"test"},
1);
} }
@Test @Test
@DisplayName("Should find partial matches when whole word search disabled") @DisplayName("Should find partial matches when whole word search disabled")
void findPartialMatches() throws IOException { void findPartialMatches() throws IOException {
testTextFinding("This is a test testing document with tested results.", testTextFinding(
"test", false, false, "This is a test testing document with tested results.",
new String[]{"test", "test", "test"}, 3); "test",
false,
false,
new String[] {"test", "test", "test"},
3);
} }
@Test @Test
@DisplayName("Should handle punctuation boundaries correctly") @DisplayName("Should handle punctuation boundaries correctly")
void handlePunctuationBoundaries() throws IOException { void handlePunctuationBoundaries() throws IOException {
testTextFinding("Hello, world! Testing: test-case (test).", testTextFinding(
"test", false, true, "Hello, world! Testing: test-case (test).",
new String[]{"test"}, 2); // Both standalone "test" and "test" in "test-case" "test",
false,
true,
new String[] {"test"},
2); // Both standalone "test" and "test" in "test-case"
} }
@Test @Test
@DisplayName("Should handle word boundaries with special characters") @DisplayName("Should handle word boundaries with special characters")
void handleSpecialCharacterBoundaries() throws IOException { void handleSpecialCharacterBoundaries() throws IOException {
testTextFinding("Email: test@example.com and test.txt file", testTextFinding(
"test", false, true, "Email: test@example.com and test.txt file",
new String[]{"test"}, 2); // Both in email and filename should match "test",
false,
true,
new String[] {"test"},
2); // Both in email and filename should match
} }
} }
@ -168,46 +208,64 @@ class TextFinderTest {
@Test @Test
@DisplayName("Should find text matching regex pattern") @DisplayName("Should find text matching regex pattern")
void findTextMatchingRegex() throws IOException { void findTextMatchingRegex() throws IOException {
testTextFinding("Contact John at 123-45-6789 or Jane at 987-65-4321 for details.", testTextFinding(
"\\d{3}-\\d{2}-\\d{4}", true, false, "Contact John at 123-45-6789 or Jane at 987-65-4321 for details.",
new String[]{"123-45-6789", "987-65-4321"}, 2); "\\d{3}-\\d{2}-\\d{4}",
true,
false,
new String[] {"123-45-6789", "987-65-4321"},
2);
} }
@Test @Test
@DisplayName("Should find email addresses with regex") @DisplayName("Should find email addresses with regex")
void findEmailAddresses() throws IOException { void findEmailAddresses() throws IOException {
testTextFinding("Email: test@example.com and admin@test.org", testTextFinding(
"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", true, false, "Email: test@example.com and admin@test.org",
new String[]{"test@example.com", "admin@test.org"}, 2); "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}",
true,
false,
new String[] {"test@example.com", "admin@test.org"},
2);
} }
@Test @Test
@DisplayName("Should combine regex with whole word search") @DisplayName("Should combine regex with whole word search")
void combineRegexWithWholeWord() throws IOException { void combineRegexWithWholeWord() throws IOException {
testTextFinding("Email: test@example.com and admin@test.org", testTextFinding(
"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", true, true, "Email: test@example.com and admin@test.org",
new String[]{"test@example.com", "admin@test.org"}, 2); "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}",
true,
true,
new String[] {"test@example.com", "admin@test.org"},
2);
} }
@Test @Test
@DisplayName("Should find currency patterns") @DisplayName("Should find currency patterns")
void findCurrencyPatterns() throws IOException { void findCurrencyPatterns() throws IOException {
testTextFinding("Price: $100.50 and €75.25", testTextFinding(
"\\$\\d+\\.\\d{2}", true, false, "Price: $100.50 and €75.25",
new String[]{"$100.50"}, 1); "\\$\\d+\\.\\d{2}",
true,
false,
new String[] {"$100.50"},
1);
} }
@ParameterizedTest @ParameterizedTest
@ValueSource(strings = { @ValueSource(
"\\d{4}-\\d{2}-\\d{2}", // Date pattern strings = {
"\\b[A-Z]{2,}\\b", // Uppercase words "\\d{4}-\\d{2}-\\d{2}", // Date pattern
"\\w+@\\w+\\.\\w+", // Simple email pattern "\\b[A-Z]{2,}\\b", // Uppercase words
"\\$\\d+", // Simple currency "\\w+@\\w+\\.\\w+", // Simple email pattern
"\\b\\d{3,4}\\b" // 3-4 digit numbers "\\$\\d+", // Simple currency
}) "\\b\\d{3,4}\\b" // 3-4 digit numbers
})
@DisplayName("Should handle various regex patterns") @DisplayName("Should handle various regex patterns")
void handleVariousRegexPatterns(String regexPattern) throws IOException { void handleVariousRegexPatterns(String regexPattern) throws IOException {
String testContent = "Date: 2023-12-25, Email: test@domain.com, Price: $250, Code: ABC123, Number: 1234"; String testContent =
"Date: 2023-12-25, Email: test@domain.com, Price: $250, Code: ABC123, Number: 1234";
addTextToPage(testContent); addTextToPage(testContent);
TextFinder textFinder = new TextFinder(regexPattern, true, false); TextFinder textFinder = new TextFinder(regexPattern, true, false);
@ -215,7 +273,9 @@ class TextFinderTest {
List<PDFText> foundTexts = textFinder.getFoundTexts(); List<PDFText> foundTexts = textFinder.getFoundTexts();
// Each pattern should find at least one match in our test content // Each pattern should find at least one match in our test content
assertFalse(foundTexts.isEmpty(), String.format("Pattern '%s' should find at least one match", regexPattern)); assertFalse(
foundTexts.isEmpty(),
String.format("Pattern '%s' should find at least one match", regexPattern));
} }
@Test @Test
@ -230,9 +290,10 @@ class TextFinderTest {
assertNotNull(foundTexts); assertNotNull(foundTexts);
} catch (java.util.regex.PatternSyntaxException e) { } catch (java.util.regex.PatternSyntaxException e) {
assertNotNull(e.getMessage()); assertNotNull(e.getMessage());
assertTrue(e.getMessage().contains("Unclosed character class") || assertTrue(
e.getMessage().contains("syntax"), e.getMessage().contains("Unclosed character class")
"Exception should indicate regex syntax error"); || e.getMessage().contains("syntax"),
"Exception should indicate regex syntax error");
} catch (RuntimeException | IOException e) { } catch (RuntimeException | IOException e) {
assertNotNull(e.getMessage()); assertNotNull(e.getMessage());
} }
@ -246,33 +307,38 @@ class TextFinderTest {
@Test @Test
@DisplayName("Should handle international characters") @DisplayName("Should handle international characters")
void handleInternationalCharacters() throws IOException { void handleInternationalCharacters() throws IOException {
testTextFinding("Hello café naïve résumé", testTextFinding(
"café", false, false, "Hello café naïve résumé", "café", false, false, new String[] {"café"}, 1);
new String[]{"café"}, 1);
} }
@Test @Test
@DisplayName("Should find text with accented characters") @DisplayName("Should find text with accented characters")
void findAccentedCharacters() throws IOException { void findAccentedCharacters() throws IOException {
testTextFinding("Café, naïve, résumé, piñata", testTextFinding(
"café", false, false, "Café, naïve, résumé, piñata",
new String[]{"Café"}, 1); // Case insensitive "café",
false,
false,
new String[] {"Café"},
1); // Case insensitive
} }
@Test @Test
@DisplayName("Should handle special symbols") @DisplayName("Should handle special symbols")
void handleSpecialSymbols() throws IOException { void handleSpecialSymbols() throws IOException {
testTextFinding("Symbols: © ® ™ ± × ÷ § ¶", testTextFinding("Symbols: © ® ™ ± × ÷ § ¶", "©", false, false, new String[] {"©"}, 1);
"©", false, false,
new String[]{"©"}, 1);
} }
@Test @Test
@DisplayName("Should find currency symbols") @DisplayName("Should find currency symbols")
void findCurrencySymbols() throws IOException { void findCurrencySymbols() throws IOException {
testTextFinding("Prices: $100 €75 £50 ¥1000", testTextFinding(
"[€£¥]", true, false, "Prices: $100 €75 £50 ¥1000",
new String[]{"", "£", "¥"}, 3); "[€£¥]",
true,
false,
new String[] {"", "£", "¥"},
3);
} }
} }
@ -330,7 +396,7 @@ class TextFinderTest {
String longTerm = "a".repeat(1000); String longTerm = "a".repeat(1000);
String content = "Short text with " + longTerm + " embedded."; String content = "Short text with " + longTerm + " embedded.";
testTextFinding(content, longTerm, false, false, new String[]{longTerm}, 1); testTextFinding(content, longTerm, false, false, new String[] {longTerm}, 1);
} }
@Test @Test
@ -350,8 +416,9 @@ class TextFinderTest {
long endTime = System.currentTimeMillis(); long endTime = System.currentTimeMillis();
assertEquals(10, foundTexts.size()); assertEquals(10, foundTexts.size());
assertTrue(endTime - startTime < 3000, assertTrue(
"Multi-page search should complete within 3 seconds"); endTime - startTime < 3000,
"Multi-page search should complete within 3 seconds");
} }
} }
@ -402,12 +469,13 @@ class TextFinderTest {
String complexRegex = "(?=.*\\d)(?=.*[a-z])(?=.*[A-Z])[a-zA-Z\\d]{6}"; String complexRegex = "(?=.*\\d)(?=.*[a-z])(?=.*[A-Z])[a-zA-Z\\d]{6}";
assertDoesNotThrow(() -> { assertDoesNotThrow(
TextFinder textFinder = new TextFinder(complexRegex, true, false); () -> {
textFinder.getText(document); TextFinder textFinder = new TextFinder(complexRegex, true, false);
List<PDFText> foundTexts = textFinder.getFoundTexts(); textFinder.getText(document);
assertNotNull(foundTexts); List<PDFText> foundTexts = textFinder.getFoundTexts();
}); assertNotNull(foundTexts);
});
} }
@ParameterizedTest @ParameterizedTest
@ -464,10 +532,11 @@ class TextFinderTest {
List<PDFText> foundTexts = textFinder.getFoundTexts(); List<PDFText> foundTexts = textFinder.getFoundTexts();
assertFalse(foundTexts.isEmpty()); assertFalse(foundTexts.isEmpty());
foundTexts.forEach(text -> { foundTexts.forEach(
assertNotNull(text.getText()); text -> {
assertTrue(text.getX1() >= 0 && text.getY1() >= 0); assertNotNull(text.getText());
}); assertTrue(text.getX1() >= 0 && text.getY1() >= 0);
});
} }
} }
@ -485,8 +554,10 @@ class TextFinderTest {
textFinder.getText(document); textFinder.getText(document);
List<PDFText> foundTexts = textFinder.getFoundTexts(); List<PDFText> foundTexts = textFinder.getFoundTexts();
assertEquals(1, foundTexts.size(), assertEquals(
"Should find exactly one standalone '1', not the ones embedded in other numbers/codes"); 1,
foundTexts.size(),
"Should find exactly one standalone '1', not the ones embedded in other numbers/codes");
assertEquals("1", foundTexts.get(0).getText()); assertEquals("1", foundTexts.get(0).getText());
} }
@ -500,14 +571,16 @@ class TextFinderTest {
textFinder.getText(document); textFinder.getText(document);
List<PDFText> foundTexts = textFinder.getFoundTexts(); List<PDFText> foundTexts = textFinder.getFoundTexts();
assertTrue(foundTexts.size() >= 3, assertTrue(
"Should find multiple instances of '1' including standalone, in '1234', and in 'A1B'"); foundTexts.size() >= 3,
"Should find multiple instances of '1' including standalone, in '1234', and in 'A1B'");
} }
@Test @Test
@DisplayName("Should find single characters in various contexts") @DisplayName("Should find single characters in various contexts")
void findSingleCharacters() throws IOException { void findSingleCharacters() throws IOException {
String content = "Grade: A. Section B has item A-1. The letter A appears multiple times."; String content =
"Grade: A. Section B has item A-1. The letter A appears multiple times.";
addTextToPage(content); addTextToPage(content);
TextFinder textFinder = new TextFinder("A", false, true); TextFinder textFinder = new TextFinder("A", false, true);
@ -522,24 +595,29 @@ class TextFinderTest {
} }
@Test @Test
@DisplayName("Should handle digits at word boundaries correctly") @DisplayName("Digits as strict standalone tokens (exclude decimals and suffixes)")
void findDigitsAtWordBoundaries() throws IOException { void findDigitsAtWordBoundaries() throws IOException {
String content = "Numbers: 1, 2, 3. Code: 123. Version: 1.0. Item1 and Item2."; String content =
"Numbers: 1, 2, 3. Code: 123. Version: 1.0. Item1 and Item2. Price: 2,50€";
addTextToPage(content); addTextToPage(content);
TextFinder textFinder1 = new TextFinder("1", false, true); TextFinder textFinder1 = new TextFinder("1", false, true);
textFinder1.getText(document); textFinder1.getText(document);
List<PDFText> foundTexts1 = textFinder1.getFoundTexts(); List<PDFText> foundTexts1 = textFinder1.getFoundTexts();
assertEquals(1, foundTexts1.size(), assertEquals(
"Should find only the standalone '1' at the beginning"); 1,
foundTexts1.size(),
"Should find only the standalone '1'; do not count the '1' in '1.0' or in 'Item1'.");
TextFinder textFinder2 = new TextFinder("2", false, true); TextFinder textFinder2 = new TextFinder("2", false, true);
textFinder2.getText(document); textFinder2.getText(document);
List<PDFText> foundTexts2 = textFinder2.getFoundTexts(); List<PDFText> foundTexts2 = textFinder2.getFoundTexts();
assertEquals(1, foundTexts2.size(), assertEquals(
"Should find only the standalone '2' in the number list"); 1,
foundTexts2.size(),
"Should find only the standalone '2' in the number list");
} }
@Test @Test
@ -566,8 +644,10 @@ class TextFinderTest {
textFinder.getText(document); textFinder.getText(document);
List<PDFText> foundTexts = textFinder.getFoundTexts(); List<PDFText> foundTexts = textFinder.getFoundTexts();
assertEquals(2, foundTexts.size(), assertEquals(
"Should find both '1' instances despite spacing variations"); 2,
foundTexts.size(),
"Should find both '1' instances despite spacing variations");
} }
} }