From 2fff3083ae9d608397582abbde86cff0e79d123d Mon Sep 17 00:00:00 2001
From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com>
Date: Tue, 26 Mar 2024 19:25:16 +0000
Subject: [PATCH] Update TextFinder.java (#980)

---
 .../software/SPDF/pdf/TextFinder.java         | 31 +++++++++++++------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/src/main/java/stirling/software/SPDF/pdf/TextFinder.java b/src/main/java/stirling/software/SPDF/pdf/TextFinder.java
index cdfb55017..f9e339c2f 100644
--- a/src/main/java/stirling/software/SPDF/pdf/TextFinder.java
+++ b/src/main/java/stirling/software/SPDF/pdf/TextFinder.java
@@ -19,6 +19,16 @@ public class TextFinder extends PDFTextStripper {
     private final boolean wholeWordSearch;
     private final List<PDFText> textOccurrences = new ArrayList<>();
 
+    private static class MatchInfo { // static: plain value holder, needs no TextFinder instance
+        final int startIndex; // offset of the match within the searched text
+        final int matchLength; // number of characters the match spans
+
+        MatchInfo(int startIndex, int matchLength) {
+            this.startIndex = startIndex;
+            this.matchLength = matchLength;
+        }
+    }
+
     public TextFinder(String searchText, boolean useRegex, boolean wholeWordSearch)
             throws IOException {
         this.searchText = searchText.toLowerCase();
@@ -27,36 +37,37 @@ public class TextFinder extends PDFTextStripper {
         setSortByPosition(true);
     }
 
-    private List<Integer> findOccurrencesInText(String searchText, String content) {
-        List<Integer> indexes = new ArrayList<>();
+    private List<MatchInfo> findOccurrencesInText(String searchText, String content) {
+        List<MatchInfo> matches = new ArrayList<>();
+        // Each match keeps its own length; regex input is grouped so "a|b" stays inside \b.
         Pattern pattern;
 
         if (useRegex) {
             // Use regex-based search
             pattern =
                     wholeWordSearch
-                            ? Pattern.compile("(\\b|_|\\.)" + searchText + "(\\b|_|\\.)")
+                            ? Pattern.compile("\\b(?:" + searchText + ")\\b")
                             : Pattern.compile(searchText);
         } else {
             // Use normal text search
             pattern =
                     wholeWordSearch
-                            ? Pattern.compile(
-                                    "(\\b|_|\\.)" + Pattern.quote(searchText) + "(\\b|_|\\.)")
+                            ? Pattern.compile("\\b" + Pattern.quote(searchText) + "\\b")
                             : Pattern.compile(Pattern.quote(searchText));
         }
 
         Matcher matcher = pattern.matcher(content);
         while (matcher.find()) {
-            indexes.add(matcher.start());
+            matches.add(new MatchInfo(matcher.start(), matcher.end() - matcher.start()));
         }
-        return indexes;
+        return matches;
     }
 
     @Override
     protected void writeString(String text, List<TextPosition> textPositions) {
-        for (Integer index : findOccurrencesInText(searchText, text.toLowerCase())) {
-            if (index + searchText.length() <= textPositions.size()) {
+        for (MatchInfo match : findOccurrencesInText(searchText, text.toLowerCase())) {
+            int index = match.startIndex;
+            if (index + match.matchLength <= textPositions.size()) {
                 // Initial values based on the first character
                 TextPosition first = textPositions.get(index);
                 float minX = first.getX();
@@ -65,7 +76,7 @@ public class TextFinder extends PDFTextStripper {
                 float maxY = first.getY() + first.getHeight();
 
                 // Loop over the rest of the characters and adjust bounding box values
-                for (int i = index; i < index + searchText.length(); i++) {
+                for (int i = index; i < index + match.matchLength; i++) {
                     TextPosition position = textPositions.get(i);
                     minX = Math.min(minX, position.getX());
                     minY = Math.min(minY, position.getY());