mirror of
https://github.com/Stirling-Tools/Stirling-PDF.git
synced 2025-06-23 16:05:09 +00:00

# Description of Changes

This pull request includes several changes primarily focused on improving configuration management, removing deprecated methods, and updating paths for external dependencies. The most important changes are summarized below:

### Configuration Management Improvements:

* Added a new `RuntimePathConfig` class to manage dynamic paths for operations and pipeline configurations (`src/main/java/stirling/software/SPDF/config/RuntimePathConfig.java`).
* Removed the `bookAndHtmlFormatsInstalled` bean and its associated logic from `AppConfig` and `EndpointConfiguration` (`src/main/java/stirling/software/SPDF/config/AppConfig.java`, `src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java`). [[1]](diffhunk://#diff-4d774ec79aa55750c0a4739bee971b68877078b73654e863fd40ee924347e143L130-L138) [[2]](diffhunk://#diff-750f31f6ecbd64b025567108a33775cad339e835a04360affff82a09410b697dL12-L35) [[3]](diffhunk://#diff-750f31f6ecbd64b025567108a33775cad339e835a04360affff82a09410b697dL275-L280)

### External Dependency Path Updates:

* Updated paths for `weasyprint` and `unoconvert` in `ExternalAppDepConfig` to use values from `RuntimePathConfig` (`src/main/java/stirling/software/SPDF/config/ExternalAppDepConfig.java`). [[1]](diffhunk://#diff-c47af298c07c2622aa98b038b78822c56bdb002de71081e102d344794e7832a6R12-L33) [[2]](diffhunk://#diff-c47af298c07c2622aa98b038b78822c56bdb002de71081e102d344794e7832a6L104-R115)

### Minor Adjustments:

* Corrected a typo from "Unoconv" to "Unoconvert" in `EndpointConfiguration` (`src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java`).

---

## Checklist

### General

- [ ] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md)
- [ ] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md) (if applicable)
- [ ] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md) (if applicable)
- [ ] I have performed a self-review of my own code
- [ ] My changes generate no new warnings

### Documentation

- [ ] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed)
- [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only)

### UI Changes (if applicable)

- [ ] Screenshots or videos demonstrating the UI changes are attached (e.g., as comments or direct attachments in the PR)

### Testing (if applicable)

- [ ] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md#6-testing) for more details.
319 lines
14 KiB
Java
319 lines
14 KiB
Java
package stirling.software.SPDF.utils;
|
|
|
|
import java.io.ByteArrayOutputStream;
|
|
import java.io.File;
|
|
import java.io.FileInputStream;
|
|
import java.io.IOException;
|
|
import java.nio.file.Files;
|
|
import java.nio.file.Path;
|
|
import java.util.ArrayList;
|
|
import java.util.Arrays;
|
|
import java.util.List;
|
|
import java.util.Objects;
|
|
import java.util.zip.ZipEntry;
|
|
import java.util.zip.ZipOutputStream;
|
|
|
|
import org.apache.commons.io.FileUtils;
|
|
import org.apache.commons.io.IOUtils;
|
|
import org.springframework.http.HttpStatus;
|
|
import org.springframework.http.MediaType;
|
|
import org.springframework.http.ResponseEntity;
|
|
import org.springframework.web.multipart.MultipartFile;
|
|
|
|
import com.vladsch.flexmark.html2md.converter.FlexmarkHtmlConverter;
|
|
import com.vladsch.flexmark.util.data.MutableDataSet;
|
|
|
|
import io.github.pixee.security.Filenames;
|
|
|
|
import lombok.extern.slf4j.Slf4j;
|
|
|
|
import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult;
|
|
|
|
@Slf4j
|
|
public class PDFToFile {
|
|
|
|
public ResponseEntity<byte[]> processPdfToMarkdown(MultipartFile inputFile)
|
|
throws IOException, InterruptedException {
|
|
if (!"application/pdf".equals(inputFile.getContentType())) {
|
|
return new ResponseEntity<>(HttpStatus.BAD_REQUEST);
|
|
}
|
|
|
|
MutableDataSet options =
|
|
new MutableDataSet()
|
|
.set(
|
|
FlexmarkHtmlConverter.MAX_BLANK_LINES,
|
|
2) // Control max consecutive blank lines
|
|
.set(
|
|
FlexmarkHtmlConverter.MAX_TRAILING_BLANK_LINES,
|
|
1) // Control trailing blank lines
|
|
.set(
|
|
FlexmarkHtmlConverter.SETEXT_HEADINGS,
|
|
true) // Use Setext headings for h1 and h2
|
|
.set(
|
|
FlexmarkHtmlConverter.OUTPUT_UNKNOWN_TAGS,
|
|
false) // Don't output HTML for unknown tags
|
|
.set(
|
|
FlexmarkHtmlConverter.TYPOGRAPHIC_QUOTES,
|
|
true) // Convert quotation marks
|
|
.set(
|
|
FlexmarkHtmlConverter.BR_AS_PARA_BREAKS,
|
|
true) // Convert <br> to paragraph breaks
|
|
.set(FlexmarkHtmlConverter.CODE_INDENT, " "); // Indent for code blocks
|
|
|
|
FlexmarkHtmlConverter htmlToMarkdownConverter =
|
|
FlexmarkHtmlConverter.builder(options).build();
|
|
|
|
String originalPdfFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename());
|
|
String pdfBaseName = originalPdfFileName;
|
|
if (originalPdfFileName.contains(".")) {
|
|
pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.'));
|
|
}
|
|
|
|
Path tempInputFile = null;
|
|
Path tempOutputDir = null;
|
|
byte[] fileBytes;
|
|
String fileName = "temp.file";
|
|
|
|
try {
|
|
tempInputFile = Files.createTempFile("input_", ".pdf");
|
|
inputFile.transferTo(tempInputFile);
|
|
tempOutputDir = Files.createTempDirectory("output_");
|
|
|
|
List<String> command =
|
|
new ArrayList<>(
|
|
Arrays.asList(
|
|
"pdftohtml",
|
|
"-s",
|
|
"-noframes",
|
|
"-c",
|
|
tempInputFile.toString(),
|
|
pdfBaseName));
|
|
|
|
ProcessExecutorResult returnCode =
|
|
ProcessExecutor.getInstance(ProcessExecutor.Processes.PDFTOHTML)
|
|
.runCommandWithOutputHandling(command, tempOutputDir.toFile());
|
|
// Process HTML files to Markdown
|
|
File[] outputFiles = Objects.requireNonNull(tempOutputDir.toFile().listFiles());
|
|
List<File> markdownFiles = new ArrayList<>();
|
|
|
|
// Convert HTML files to Markdown
|
|
for (File outputFile : outputFiles) {
|
|
if (outputFile.getName().endsWith(".html")) {
|
|
String html = Files.readString(outputFile.toPath());
|
|
String markdown = htmlToMarkdownConverter.convert(html);
|
|
|
|
String mdFileName = outputFile.getName().replace(".html", ".md");
|
|
File mdFile = new File(tempOutputDir.toFile(), mdFileName);
|
|
Files.writeString(mdFile.toPath(), markdown);
|
|
markdownFiles.add(mdFile);
|
|
}
|
|
}
|
|
|
|
// If there's only one markdown file, return it directly
|
|
if (markdownFiles.size() == 1) {
|
|
fileName = pdfBaseName + ".md";
|
|
fileBytes = Files.readAllBytes(markdownFiles.get(0).toPath());
|
|
} else {
|
|
// Multiple files - create a zip
|
|
fileName = pdfBaseName + "ToMarkdown.zip";
|
|
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
|
|
|
|
try (ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream)) {
|
|
// Add markdown files
|
|
for (File mdFile : markdownFiles) {
|
|
ZipEntry mdEntry = new ZipEntry(mdFile.getName());
|
|
zipOutputStream.putNextEntry(mdEntry);
|
|
Files.copy(mdFile.toPath(), zipOutputStream);
|
|
zipOutputStream.closeEntry();
|
|
}
|
|
|
|
// Add images and other assets
|
|
for (File file : outputFiles) {
|
|
if (!file.getName().endsWith(".html") && !file.getName().endsWith(".md")) {
|
|
ZipEntry assetEntry = new ZipEntry(file.getName());
|
|
zipOutputStream.putNextEntry(assetEntry);
|
|
Files.copy(file.toPath(), zipOutputStream);
|
|
zipOutputStream.closeEntry();
|
|
}
|
|
}
|
|
}
|
|
|
|
fileBytes = byteArrayOutputStream.toByteArray();
|
|
}
|
|
|
|
} finally {
|
|
if (tempInputFile != null) Files.deleteIfExists(tempInputFile);
|
|
if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile());
|
|
}
|
|
return WebResponseUtils.bytesToWebResponse(
|
|
fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);
|
|
}
|
|
|
|
public ResponseEntity<byte[]> processPdfToHtml(MultipartFile inputFile)
|
|
throws IOException, InterruptedException {
|
|
if (!"application/pdf".equals(inputFile.getContentType())) {
|
|
return new ResponseEntity<>(HttpStatus.BAD_REQUEST);
|
|
}
|
|
|
|
// Get the original PDF file name without the extension
|
|
String originalPdfFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename());
|
|
String pdfBaseName = originalPdfFileName;
|
|
if (originalPdfFileName.contains(".")) {
|
|
pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.'));
|
|
}
|
|
|
|
Path tempInputFile = null;
|
|
Path tempOutputDir = null;
|
|
byte[] fileBytes;
|
|
String fileName = "temp.file";
|
|
|
|
try {
|
|
// Save the uploaded file to a temporary location
|
|
tempInputFile = Files.createTempFile("input_", ".pdf");
|
|
inputFile.transferTo(tempInputFile);
|
|
|
|
// Prepare the output directory
|
|
tempOutputDir = Files.createTempDirectory("output_");
|
|
|
|
// Run the pdftohtml command with complex output
|
|
List<String> command =
|
|
new ArrayList<>(
|
|
Arrays.asList(
|
|
"pdftohtml", "-c", tempInputFile.toString(), pdfBaseName));
|
|
|
|
ProcessExecutorResult returnCode =
|
|
ProcessExecutor.getInstance(ProcessExecutor.Processes.PDFTOHTML)
|
|
.runCommandWithOutputHandling(command, tempOutputDir.toFile());
|
|
|
|
// Get output files
|
|
File[] outputFiles = Objects.requireNonNull(tempOutputDir.toFile().listFiles());
|
|
|
|
// Return output files in a ZIP archive
|
|
fileName = pdfBaseName + "ToHtml.zip";
|
|
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
|
|
try (ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream)) {
|
|
for (File outputFile : outputFiles) {
|
|
ZipEntry entry = new ZipEntry(outputFile.getName());
|
|
zipOutputStream.putNextEntry(entry);
|
|
try (FileInputStream fis = new FileInputStream(outputFile)) {
|
|
IOUtils.copy(fis, zipOutputStream);
|
|
} catch (IOException e) {
|
|
log.error("Exception writing zip entry", e);
|
|
}
|
|
zipOutputStream.closeEntry();
|
|
}
|
|
} catch (IOException e) {
|
|
log.error("Exception writing zip", e);
|
|
}
|
|
fileBytes = byteArrayOutputStream.toByteArray();
|
|
|
|
} finally {
|
|
// Clean up the temporary files
|
|
if (tempInputFile != null) Files.deleteIfExists(tempInputFile);
|
|
if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile());
|
|
}
|
|
|
|
return WebResponseUtils.bytesToWebResponse(
|
|
fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);
|
|
}
|
|
|
|
public ResponseEntity<byte[]> processPdfToOfficeFormat(
|
|
MultipartFile inputFile, String outputFormat, String libreOfficeFilter)
|
|
throws IOException, InterruptedException {
|
|
|
|
if (!"application/pdf".equals(inputFile.getContentType())) {
|
|
return new ResponseEntity<>(HttpStatus.BAD_REQUEST);
|
|
}
|
|
|
|
// Get the original PDF file name without the extension
|
|
String originalPdfFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename());
|
|
|
|
if (originalPdfFileName == null || "".equals(originalPdfFileName.trim())) {
|
|
originalPdfFileName = "output.pdf";
|
|
}
|
|
// Assume file is pdf if no extension
|
|
String pdfBaseName = originalPdfFileName;
|
|
if (originalPdfFileName.contains(".")) {
|
|
pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.'));
|
|
}
|
|
// Validate output format
|
|
List<String> allowedFormats =
|
|
Arrays.asList("doc", "docx", "odt", "ppt", "pptx", "odp", "rtf", "xml", "txt:Text");
|
|
if (!allowedFormats.contains(outputFormat)) {
|
|
return new ResponseEntity<>(HttpStatus.BAD_REQUEST);
|
|
}
|
|
|
|
Path tempInputFile = null;
|
|
Path tempOutputDir = null;
|
|
byte[] fileBytes;
|
|
String fileName = "temp.file";
|
|
|
|
try {
|
|
// Save the uploaded file to a temporary location
|
|
tempInputFile = Files.createTempFile("input_", ".pdf");
|
|
inputFile.transferTo(tempInputFile);
|
|
|
|
// Prepare the output directory
|
|
tempOutputDir = Files.createTempDirectory("output_");
|
|
|
|
// Run the LibreOffice command
|
|
List<String> command =
|
|
new ArrayList<>(
|
|
Arrays.asList(
|
|
"soffice",
|
|
"--headless",
|
|
"--nologo",
|
|
"--infilter=" + libreOfficeFilter,
|
|
"--convert-to",
|
|
outputFormat,
|
|
"--outdir",
|
|
tempOutputDir.toString(),
|
|
tempInputFile.toString()));
|
|
ProcessExecutorResult returnCode =
|
|
ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE)
|
|
.runCommandWithOutputHandling(command);
|
|
|
|
// Get output files
|
|
List<File> outputFiles = Arrays.asList(tempOutputDir.toFile().listFiles());
|
|
|
|
if (outputFiles.size() == 1) {
|
|
// Return single output file
|
|
File outputFile = outputFiles.get(0);
|
|
if ("txt:Text".equals(outputFormat)) {
|
|
outputFormat = "txt";
|
|
}
|
|
fileName = pdfBaseName + "." + outputFormat;
|
|
fileBytes = FileUtils.readFileToByteArray(outputFile);
|
|
} else {
|
|
// Return output files in a ZIP archive
|
|
fileName = pdfBaseName + "To" + outputFormat + ".zip";
|
|
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
|
|
try (ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream)) {
|
|
for (File outputFile : outputFiles) {
|
|
ZipEntry entry = new ZipEntry(outputFile.getName());
|
|
zipOutputStream.putNextEntry(entry);
|
|
try (FileInputStream fis = new FileInputStream(outputFile)) {
|
|
IOUtils.copy(fis, zipOutputStream);
|
|
} catch (IOException e) {
|
|
log.error("Exception writing zip entry", e);
|
|
}
|
|
|
|
zipOutputStream.closeEntry();
|
|
}
|
|
} catch (IOException e) {
|
|
log.error("Exception writing zip", e);
|
|
}
|
|
|
|
fileBytes = byteArrayOutputStream.toByteArray();
|
|
}
|
|
|
|
} finally {
|
|
// Clean up the temporary files
|
|
Files.deleteIfExists(tempInputFile);
|
|
if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile());
|
|
}
|
|
return WebResponseUtils.bytesToWebResponse(
|
|
fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);
|
|
}
|
|
}
|