Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

319 lines
14 KiB
Java
Raw Normal View History

package stirling.software.SPDF.utils;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.springframework.http.HttpStatus;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.web.multipart.MultipartFile;
import com.vladsch.flexmark.html2md.converter.FlexmarkHtmlConverter;
import com.vladsch.flexmark.util.data.MutableDataSet;
import io.github.pixee.security.Filenames;
2024-06-02 12:02:01 +01:00
import lombok.extern.slf4j.Slf4j;
Dynamic paths for tools and removal of unused book endpoints (#3018) # Description of Changes This pull request includes several changes primarily focused on improving configuration management, removing deprecated methods, and updating paths for external dependencies. The most important changes are summarized below: ### Configuration Management Improvements: * Added a new `RuntimePathConfig` class to manage dynamic paths for operations and pipeline configurations (`src/main/java/stirling/software/SPDF/config/RuntimePathConfig.java`). * Removed the `bookAndHtmlFormatsInstalled` bean and its associated logic from `AppConfig` and `EndpointConfiguration` (`src/main/java/stirling/software/SPDF/config/AppConfig.java`, `src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java`). [[1]](diffhunk://#diff-4d774ec79aa55750c0a4739bee971b68877078b73654e863fd40ee924347e143L130-L138) [[2]](diffhunk://#diff-750f31f6ecbd64b025567108a33775cad339e835a04360affff82a09410b697dL12-L35) [[3]](diffhunk://#diff-750f31f6ecbd64b025567108a33775cad339e835a04360affff82a09410b697dL275-L280) ### External Dependency Path Updates: * Updated paths for `weasyprint` and `unoconvert` in `ExternalAppDepConfig` to use values from `RuntimePathConfig` (`src/main/java/stirling/software/SPDF/config/ExternalAppDepConfig.java`). [[1]](diffhunk://#diff-c47af298c07c2622aa98b038b78822c56bdb002de71081e102d344794e7832a6R12-L33) [[2]](diffhunk://#diff-c47af298c07c2622aa98b038b78822c56bdb002de71081e102d344794e7832a6L104-R115) ### Minor Adjustments: * Corrected a typo from "Unoconv" to "Unoconvert" in `EndpointConfiguration` (`src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java`). --- ## Checklist ### General - [ ] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [ ] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md) (if applicable) - [ ] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md) (if applicable) - [ ] I have performed a self-review of my own code - [ ] My changes generate no new warnings ### Documentation - [ ] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed) - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) ### UI Changes (if applicable) - [ ] Screenshots or videos demonstrating the UI changes are attached (e.g., as comments or direct attachments in the PR) ### Testing (if applicable) - [ ] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md#6-testing) for more details.
2025-02-23 13:36:21 +00:00
2023-07-29 13:53:30 +01:00
import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult;
@Slf4j
public class PDFToFile {
public ResponseEntity<byte[]> processPdfToMarkdown(MultipartFile inputFile)
throws IOException, InterruptedException {
if (!"application/pdf".equals(inputFile.getContentType())) {
return new ResponseEntity<>(HttpStatus.BAD_REQUEST);
}
MutableDataSet options =
new MutableDataSet()
.set(
FlexmarkHtmlConverter.MAX_BLANK_LINES,
2) // Control max consecutive blank lines
.set(
FlexmarkHtmlConverter.MAX_TRAILING_BLANK_LINES,
1) // Control trailing blank lines
.set(
FlexmarkHtmlConverter.SETEXT_HEADINGS,
true) // Use Setext headings for h1 and h2
.set(
FlexmarkHtmlConverter.OUTPUT_UNKNOWN_TAGS,
false) // Don't output HTML for unknown tags
.set(
FlexmarkHtmlConverter.TYPOGRAPHIC_QUOTES,
true) // Convert quotation marks
.set(
FlexmarkHtmlConverter.BR_AS_PARA_BREAKS,
true) // Convert <br> to paragraph breaks
.set(FlexmarkHtmlConverter.CODE_INDENT, " "); // Indent for code blocks
FlexmarkHtmlConverter htmlToMarkdownConverter =
FlexmarkHtmlConverter.builder(options).build();
String originalPdfFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename());
String pdfBaseName = originalPdfFileName;
if (originalPdfFileName.contains(".")) {
pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.'));
}
Path tempInputFile = null;
Path tempOutputDir = null;
byte[] fileBytes;
String fileName = "temp.file";
try {
tempInputFile = Files.createTempFile("input_", ".pdf");
inputFile.transferTo(tempInputFile);
tempOutputDir = Files.createTempDirectory("output_");
List<String> command =
new ArrayList<>(
Arrays.asList(
"pdftohtml",
"-s",
"-noframes",
"-c",
tempInputFile.toString(),
pdfBaseName));
ProcessExecutorResult returnCode =
ProcessExecutor.getInstance(ProcessExecutor.Processes.PDFTOHTML)
.runCommandWithOutputHandling(command, tempOutputDir.toFile());
// Process HTML files to Markdown
File[] outputFiles = Objects.requireNonNull(tempOutputDir.toFile().listFiles());
List<File> markdownFiles = new ArrayList<>();
// Convert HTML files to Markdown
for (File outputFile : outputFiles) {
if (outputFile.getName().endsWith(".html")) {
String html = Files.readString(outputFile.toPath());
String markdown = htmlToMarkdownConverter.convert(html);
String mdFileName = outputFile.getName().replace(".html", ".md");
File mdFile = new File(tempOutputDir.toFile(), mdFileName);
Files.writeString(mdFile.toPath(), markdown);
markdownFiles.add(mdFile);
}
}
// If there's only one markdown file, return it directly
if (markdownFiles.size() == 1) {
fileName = pdfBaseName + ".md";
fileBytes = Files.readAllBytes(markdownFiles.get(0).toPath());
} else {
// Multiple files - create a zip
fileName = pdfBaseName + "ToMarkdown.zip";
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
try (ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream)) {
// Add markdown files
for (File mdFile : markdownFiles) {
ZipEntry mdEntry = new ZipEntry(mdFile.getName());
zipOutputStream.putNextEntry(mdEntry);
Files.copy(mdFile.toPath(), zipOutputStream);
zipOutputStream.closeEntry();
}
// Add images and other assets
for (File file : outputFiles) {
if (!file.getName().endsWith(".html") && !file.getName().endsWith(".md")) {
ZipEntry assetEntry = new ZipEntry(file.getName());
zipOutputStream.putNextEntry(assetEntry);
Files.copy(file.toPath(), zipOutputStream);
zipOutputStream.closeEntry();
}
}
}
fileBytes = byteArrayOutputStream.toByteArray();
}
} finally {
if (tempInputFile != null) Files.deleteIfExists(tempInputFile);
if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile());
}
return WebResponseUtils.bytesToWebResponse(
fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);
}
public ResponseEntity<byte[]> processPdfToHtml(MultipartFile inputFile)
throws IOException, InterruptedException {
if (!"application/pdf".equals(inputFile.getContentType())) {
return new ResponseEntity<>(HttpStatus.BAD_REQUEST);
}
// Get the original PDF file name without the extension
String originalPdfFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename());
2024-05-26 15:31:34 +01:00
String pdfBaseName = originalPdfFileName;
if (originalPdfFileName.contains(".")) {
pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.'));
}
Path tempInputFile = null;
Path tempOutputDir = null;
byte[] fileBytes;
String fileName = "temp.file";
try {
// Save the uploaded file to a temporary location
tempInputFile = Files.createTempFile("input_", ".pdf");
2024-05-27 17:53:18 +01:00
inputFile.transferTo(tempInputFile);
// Prepare the output directory
tempOutputDir = Files.createTempDirectory("output_");
// Run the pdftohtml command with complex output
List<String> command =
new ArrayList<>(
Arrays.asList(
"pdftohtml", "-c", tempInputFile.toString(), pdfBaseName));
ProcessExecutorResult returnCode =
ProcessExecutor.getInstance(ProcessExecutor.Processes.PDFTOHTML)
.runCommandWithOutputHandling(command, tempOutputDir.toFile());
// Get output files
File[] outputFiles = Objects.requireNonNull(tempOutputDir.toFile().listFiles());
// Return output files in a ZIP archive
fileName = pdfBaseName + "ToHtml.zip";
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
2024-06-02 11:59:43 +01:00
try (ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream)) {
for (File outputFile : outputFiles) {
ZipEntry entry = new ZipEntry(outputFile.getName());
zipOutputStream.putNextEntry(entry);
try (FileInputStream fis = new FileInputStream(outputFile)) {
IOUtils.copy(fis, zipOutputStream);
} catch (IOException e) {
log.error("Exception writing zip entry", e);
2024-06-02 11:59:43 +01:00
}
zipOutputStream.closeEntry();
}
} catch (IOException e) {
log.error("Exception writing zip", e);
}
fileBytes = byteArrayOutputStream.toByteArray();
} finally {
// Clean up the temporary files
2024-05-27 17:53:18 +01:00
if (tempInputFile != null) Files.deleteIfExists(tempInputFile);
if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile());
}
return WebResponseUtils.bytesToWebResponse(
fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);
}
public ResponseEntity<byte[]> processPdfToOfficeFormat(
MultipartFile inputFile, String outputFormat, String libreOfficeFilter)
throws IOException, InterruptedException {
if (!"application/pdf".equals(inputFile.getContentType())) {
return new ResponseEntity<>(HttpStatus.BAD_REQUEST);
}
// Get the original PDF file name without the extension
String originalPdfFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename());
2024-05-26 15:31:34 +01:00
if (originalPdfFileName == null || "".equals(originalPdfFileName.trim())) {
originalPdfFileName = "output.pdf";
}
// Assume file is pdf if no extension
String pdfBaseName = originalPdfFileName;
if (originalPdfFileName.contains(".")) {
pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.'));
}
// Validate output format
List<String> allowedFormats =
Arrays.asList("doc", "docx", "odt", "ppt", "pptx", "odp", "rtf", "xml", "txt:Text");
if (!allowedFormats.contains(outputFormat)) {
return new ResponseEntity<>(HttpStatus.BAD_REQUEST);
}
Path tempInputFile = null;
Path tempOutputDir = null;
byte[] fileBytes;
2023-05-01 21:57:48 +01:00
String fileName = "temp.file";
try {
// Save the uploaded file to a temporary location
tempInputFile = Files.createTempFile("input_", ".pdf");
2024-05-27 17:53:18 +01:00
inputFile.transferTo(tempInputFile);
// Prepare the output directory
tempOutputDir = Files.createTempDirectory("output_");
// Run the LibreOffice command
List<String> command =
new ArrayList<>(
Arrays.asList(
"soffice",
"--headless",
"--nologo",
"--infilter=" + libreOfficeFilter,
"--convert-to",
outputFormat,
"--outdir",
tempOutputDir.toString(),
tempInputFile.toString()));
2023-07-29 13:53:30 +01:00
ProcessExecutorResult returnCode =
ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE)
.runCommandWithOutputHandling(command);
// Get output files
List<File> outputFiles = Arrays.asList(tempOutputDir.toFile().listFiles());
if (outputFiles.size() == 1) {
// Return single output file
File outputFile = outputFiles.get(0);
if ("txt:Text".equals(outputFormat)) {
outputFormat = "txt";
}
2023-05-01 21:57:48 +01:00
fileName = pdfBaseName + "." + outputFormat;
fileBytes = FileUtils.readFileToByteArray(outputFile);
} else {
// Return output files in a ZIP archive
2024-06-02 12:02:01 +01:00
fileName = pdfBaseName + "To" + outputFormat + ".zip";
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
try (ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream)) {
for (File outputFile : outputFiles) {
ZipEntry entry = new ZipEntry(outputFile.getName());
zipOutputStream.putNextEntry(entry);
try (FileInputStream fis = new FileInputStream(outputFile)) {
IOUtils.copy(fis, zipOutputStream);
} catch (IOException e) {
log.error("Exception writing zip entry", e);
2024-06-02 12:02:01 +01:00
}
zipOutputStream.closeEntry();
}
} catch (IOException e) {
log.error("Exception writing zip", e);
2024-06-02 12:02:01 +01:00
}
fileBytes = byteArrayOutputStream.toByteArray();
}
} finally {
// Clean up the temporary files
2024-05-27 17:53:18 +01:00
Files.deleteIfExists(tempInputFile);
if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile());
}
2023-05-31 20:15:48 +01:00
return WebResponseUtils.bytesToWebResponse(
fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);
}
}