Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

108 lines
3.7 KiB
Java
Raw Normal View History

2023-08-24 23:23:25 +01:00
package stirling.software.SPDF.pdf;
2023-12-30 19:11:27 +00:00
2023-08-24 23:23:25 +01:00
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
2023-08-27 00:39:22 +01:00
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
2024-10-14 22:34:41 +01:00
import lombok.extern.slf4j.Slf4j;
Dynamic paths for tools and removal of unused book endpoints (#3018) # Description of Changes This pull request includes several changes primarily focused on improving configuration management, removing deprecated methods, and updating paths for external dependencies. The most important changes are summarized below: ### Configuration Management Improvements: * Added a new `RuntimePathConfig` class to manage dynamic paths for operations and pipeline configurations (`src/main/java/stirling/software/SPDF/config/RuntimePathConfig.java`). * Removed the `bookAndHtmlFormatsInstalled` bean and its associated logic from `AppConfig` and `EndpointConfiguration` (`src/main/java/stirling/software/SPDF/config/AppConfig.java`, `src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java`). [[1]](diffhunk://#diff-4d774ec79aa55750c0a4739bee971b68877078b73654e863fd40ee924347e143L130-L138) [[2]](diffhunk://#diff-750f31f6ecbd64b025567108a33775cad339e835a04360affff82a09410b697dL12-L35) [[3]](diffhunk://#diff-750f31f6ecbd64b025567108a33775cad339e835a04360affff82a09410b697dL275-L280) ### External Dependency Path Updates: * Updated paths for `weasyprint` and `unoconvert` in `ExternalAppDepConfig` to use values from `RuntimePathConfig` (`src/main/java/stirling/software/SPDF/config/ExternalAppDepConfig.java`). [[1]](diffhunk://#diff-c47af298c07c2622aa98b038b78822c56bdb002de71081e102d344794e7832a6R12-L33) [[2]](diffhunk://#diff-c47af298c07c2622aa98b038b78822c56bdb002de71081e102d344794e7832a6L104-R115) ### Minor Adjustments: * Corrected a typo from "Unoconv" to "Unoconvert" in `EndpointConfiguration` (`src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java`). --- ## Checklist ### General - [ ] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [ ] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md) (if applicable) - [ ] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md) (if applicable) - [ ] I have performed a self-review of my own code - [ ] My changes generate no new warnings ### Documentation - [ ] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed) - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) ### UI Changes (if applicable) - [ ] Screenshots or videos demonstrating the UI changes are attached (e.g., as comments or direct attachments in the PR) ### Testing (if applicable) - [ ] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md#6-testing) for more details.
2025-02-23 13:36:21 +00:00
2023-08-27 00:39:22 +01:00
import stirling.software.SPDF.model.PDFText;
2024-10-14 22:34:41 +01:00
@Slf4j
2023-08-24 23:23:25 +01:00
public class TextFinder extends PDFTextStripper {
private final String searchText;
private final boolean useRegex;
private final boolean wholeWordSearch;
private final List<PDFText> textOccurrences = new ArrayList<>();
public TextFinder(String searchText, boolean useRegex, boolean wholeWordSearch)
throws IOException {
this.searchText = searchText.toLowerCase();
this.useRegex = useRegex;
this.wholeWordSearch = wholeWordSearch;
setSortByPosition(true);
}
2024-03-26 19:25:16 +00:00
private List<MatchInfo> findOccurrencesInText(String searchText, String content) {
List<MatchInfo> matches = new ArrayList<>();
2023-08-24 23:23:25 +01:00
Pattern pattern;
if (useRegex) {
// Use regex-based search
pattern =
wholeWordSearch
2024-03-26 19:25:16 +00:00
? Pattern.compile("\\b" + searchText + "\\b")
2023-08-24 23:23:25 +01:00
: Pattern.compile(searchText);
} else {
// Use normal text search
pattern =
wholeWordSearch
2024-03-26 19:25:16 +00:00
? Pattern.compile("\\b" + Pattern.quote(searchText) + "\\b")
2023-08-24 23:23:25 +01:00
: Pattern.compile(Pattern.quote(searchText));
}
Matcher matcher = pattern.matcher(content);
while (matcher.find()) {
2024-03-26 19:25:16 +00:00
matches.add(new MatchInfo(matcher.start(), matcher.end() - matcher.start()));
2023-08-24 23:23:25 +01:00
}
2024-03-26 19:25:16 +00:00
return matches;
2023-08-24 23:23:25 +01:00
}
@Override
protected void writeString(String text, List<TextPosition> textPositions) {
2024-03-26 19:25:16 +00:00
for (MatchInfo match : findOccurrencesInText(searchText, text.toLowerCase())) {
int index = match.startIndex;
if (index + match.matchLength <= textPositions.size()) {
2023-08-24 23:23:25 +01:00
// Initial values based on the first character
TextPosition first = textPositions.get(index);
float minX = first.getX();
float minY = first.getY();
float maxX = first.getX() + first.getWidth();
float maxY = first.getY() + first.getHeight();
// Loop over the rest of the characters and adjust bounding box values
2024-03-26 19:25:16 +00:00
for (int i = index; i < index + match.matchLength; i++) {
2023-08-24 23:23:25 +01:00
TextPosition position = textPositions.get(i);
minX = Math.min(minX, position.getX());
minY = Math.min(minY, position.getY());
maxX = Math.max(maxX, position.getX() + position.getWidth());
maxY = Math.max(maxY, position.getY() + position.getHeight());
}
textOccurrences.add(
new PDFText(getCurrentPageNo() - 1, minX, minY, maxX, maxY, text));
2023-12-30 19:11:27 +00:00
}
}
}
2023-08-24 23:23:25 +01:00
public List<PDFText> getTextLocations(PDDocument document) throws Exception {
this.getText(document);
2024-10-14 22:34:41 +01:00
log.debug(
2023-12-30 19:11:27 +00:00
"Found "
2023-08-24 23:23:25 +01:00
+ textOccurrences.size()
+ " occurrences of '"
+ searchText
+ "' in the document.");
return textOccurrences;
}
private class MatchInfo {
int startIndex;
int matchLength;
MatchInfo(int startIndex, int matchLength) {
this.startIndex = startIndex;
this.matchLength = matchLength;
}
}
2023-12-30 19:11:27 +00:00
}