mirror of
https://github.com/Stirling-Tools/Stirling-PDF.git
synced 2025-05-21 17:32:01 +00:00

# Description of Changes

- **What was changed**
  - Updated controller methods to use strongly‐typed primitives (`int`, `long`, `boolean`) instead of `String` for numeric and boolean parameters, eliminating calls to `Integer.parseInt`/`Long.parseLong` and improving null‐safety (`Boolean.TRUE.equals(...)`).
  - Enhanced all API request model classes with richer Swagger/OpenAPI annotations: added `requiredMode`, `defaultValue`, `allowableValues`, `format`, `pattern`, and tightened schema descriptions for all fields.
  - Refactored HTML form templates for “Remove Blank Pages” to include `min`, `max`, and `step` attributes on numeric inputs, matching the updated validation rules.
- **Why the change was made**
  - **Type safety & robustness**: Shifting from `String` to native types prevents runtime parsing errors, simplifies controller logic, and makes default values explicit.
  - **Better API documentation & validation**: Enriching the Swagger annotations ensures generated docs accurately reflect required fields, default values, and permitted ranges, which improves client code generation and developer experience.
  - **Consistency across codebase**: Aligning all request models and controllers enforces a uniform coding style and reduces bugs.

#3406

---

## Checklist

### General

- [x] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md)
- [x] I have read the [Stirling-PDF Developer Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md) (if applicable)
- [ ] I have read the [How to add new languages to Stirling-PDF](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md) (if applicable)
- [x] I have performed a self-review of my own code
- [x] My changes generate no new warnings

### Documentation

- [ ] I have updated relevant docs on [Stirling-PDF's doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) (if functionality has heavily changed)
- [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only)

### UI Changes (if applicable)

- [ ] Screenshots or videos demonstrating the UI changes are attached (e.g., as comments or direct attachments in the PR)

### Testing (if applicable)

- [ ] I have tested my changes locally. Refer to the [Testing Guide](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/DeveloperGuide.md#6-testing) for more details.
321 lines
14 KiB
Java
321 lines
14 KiB
Java
package stirling.software.SPDF.utils;
|
|
|
|
import java.io.ByteArrayOutputStream;
|
|
import java.io.File;
|
|
import java.io.FileInputStream;
|
|
import java.io.IOException;
|
|
import java.nio.file.Files;
|
|
import java.nio.file.Path;
|
|
import java.util.ArrayList;
|
|
import java.util.Arrays;
|
|
import java.util.List;
|
|
import java.util.Objects;
|
|
import java.util.zip.ZipEntry;
|
|
import java.util.zip.ZipOutputStream;
|
|
|
|
import org.apache.commons.io.FileUtils;
|
|
import org.apache.commons.io.IOUtils;
|
|
import org.springframework.http.HttpStatus;
|
|
import org.springframework.http.MediaType;
|
|
import org.springframework.http.ResponseEntity;
|
|
import org.springframework.web.multipart.MultipartFile;
|
|
|
|
import com.vladsch.flexmark.html2md.converter.FlexmarkHtmlConverter;
|
|
import com.vladsch.flexmark.util.data.MutableDataSet;
|
|
|
|
import io.github.pixee.security.Filenames;
|
|
|
|
import lombok.NoArgsConstructor;
|
|
import lombok.extern.slf4j.Slf4j;
|
|
|
|
import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult;
|
|
|
|
@Slf4j
|
|
@NoArgsConstructor
|
|
public class PDFToFile {
|
|
|
|
public ResponseEntity<byte[]> processPdfToMarkdown(MultipartFile inputFile)
|
|
throws IOException, InterruptedException {
|
|
if (!"application/pdf".equals(inputFile.getContentType())) {
|
|
return new ResponseEntity<>(HttpStatus.BAD_REQUEST);
|
|
}
|
|
|
|
MutableDataSet options =
|
|
new MutableDataSet()
|
|
.set(
|
|
FlexmarkHtmlConverter.MAX_BLANK_LINES,
|
|
2) // Control max consecutive blank lines
|
|
.set(
|
|
FlexmarkHtmlConverter.MAX_TRAILING_BLANK_LINES,
|
|
1) // Control trailing blank lines
|
|
.set(
|
|
FlexmarkHtmlConverter.SETEXT_HEADINGS,
|
|
true) // Use Setext headings for h1 and h2
|
|
.set(
|
|
FlexmarkHtmlConverter.OUTPUT_UNKNOWN_TAGS,
|
|
false) // Don't output HTML for unknown tags
|
|
.set(
|
|
FlexmarkHtmlConverter.TYPOGRAPHIC_QUOTES,
|
|
true) // Convert quotation marks
|
|
.set(
|
|
FlexmarkHtmlConverter.BR_AS_PARA_BREAKS,
|
|
true) // Convert <br> to paragraph breaks
|
|
.set(FlexmarkHtmlConverter.CODE_INDENT, " "); // Indent for code blocks
|
|
|
|
FlexmarkHtmlConverter htmlToMarkdownConverter =
|
|
FlexmarkHtmlConverter.builder(options).build();
|
|
|
|
String originalPdfFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename());
|
|
String pdfBaseName = originalPdfFileName;
|
|
if (originalPdfFileName.contains(".")) {
|
|
pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.'));
|
|
}
|
|
|
|
Path tempInputFile = null;
|
|
Path tempOutputDir = null;
|
|
byte[] fileBytes;
|
|
String fileName = "temp.file";
|
|
|
|
try {
|
|
tempInputFile = Files.createTempFile("input_", ".pdf");
|
|
inputFile.transferTo(tempInputFile);
|
|
tempOutputDir = Files.createTempDirectory("output_");
|
|
|
|
List<String> command =
|
|
new ArrayList<>(
|
|
Arrays.asList(
|
|
"pdftohtml",
|
|
"-s",
|
|
"-noframes",
|
|
"-c",
|
|
tempInputFile.toString(),
|
|
pdfBaseName));
|
|
|
|
ProcessExecutorResult returnCode =
|
|
ProcessExecutor.getInstance(ProcessExecutor.Processes.PDFTOHTML)
|
|
.runCommandWithOutputHandling(command, tempOutputDir.toFile());
|
|
// Process HTML files to Markdown
|
|
File[] outputFiles = Objects.requireNonNull(tempOutputDir.toFile().listFiles());
|
|
List<File> markdownFiles = new ArrayList<>();
|
|
|
|
// Convert HTML files to Markdown
|
|
for (File outputFile : outputFiles) {
|
|
if (outputFile.getName().endsWith(".html")) {
|
|
String html = Files.readString(outputFile.toPath());
|
|
String markdown = htmlToMarkdownConverter.convert(html);
|
|
|
|
String mdFileName = outputFile.getName().replace(".html", ".md");
|
|
File mdFile = new File(tempOutputDir.toFile(), mdFileName);
|
|
Files.writeString(mdFile.toPath(), markdown);
|
|
markdownFiles.add(mdFile);
|
|
}
|
|
}
|
|
|
|
// If there's only one markdown file, return it directly
|
|
if (markdownFiles.size() == 1) {
|
|
fileName = pdfBaseName + ".md";
|
|
fileBytes = Files.readAllBytes(markdownFiles.get(0).toPath());
|
|
} else {
|
|
// Multiple files - create a zip
|
|
fileName = pdfBaseName + "ToMarkdown.zip";
|
|
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
|
|
|
|
try (ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream)) {
|
|
// Add markdown files
|
|
for (File mdFile : markdownFiles) {
|
|
ZipEntry mdEntry = new ZipEntry(mdFile.getName());
|
|
zipOutputStream.putNextEntry(mdEntry);
|
|
Files.copy(mdFile.toPath(), zipOutputStream);
|
|
zipOutputStream.closeEntry();
|
|
}
|
|
|
|
// Add images and other assets
|
|
for (File file : outputFiles) {
|
|
if (!file.getName().endsWith(".html") && !file.getName().endsWith(".md")) {
|
|
ZipEntry assetEntry = new ZipEntry(file.getName());
|
|
zipOutputStream.putNextEntry(assetEntry);
|
|
Files.copy(file.toPath(), zipOutputStream);
|
|
zipOutputStream.closeEntry();
|
|
}
|
|
}
|
|
}
|
|
|
|
fileBytes = byteArrayOutputStream.toByteArray();
|
|
}
|
|
|
|
} finally {
|
|
if (tempInputFile != null) Files.deleteIfExists(tempInputFile);
|
|
if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile());
|
|
}
|
|
return WebResponseUtils.bytesToWebResponse(
|
|
fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);
|
|
}
|
|
|
|
public ResponseEntity<byte[]> processPdfToHtml(MultipartFile inputFile)
|
|
throws IOException, InterruptedException {
|
|
if (!"application/pdf".equals(inputFile.getContentType())) {
|
|
return new ResponseEntity<>(HttpStatus.BAD_REQUEST);
|
|
}
|
|
|
|
// Get the original PDF file name without the extension
|
|
String originalPdfFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename());
|
|
String pdfBaseName = originalPdfFileName;
|
|
if (originalPdfFileName.contains(".")) {
|
|
pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.'));
|
|
}
|
|
|
|
Path tempInputFile = null;
|
|
Path tempOutputDir = null;
|
|
byte[] fileBytes;
|
|
String fileName = "temp.file";
|
|
|
|
try {
|
|
// Save the uploaded file to a temporary location
|
|
tempInputFile = Files.createTempFile("input_", ".pdf");
|
|
inputFile.transferTo(tempInputFile);
|
|
|
|
// Prepare the output directory
|
|
tempOutputDir = Files.createTempDirectory("output_");
|
|
|
|
// Run the pdftohtml command with complex output
|
|
List<String> command =
|
|
new ArrayList<>(
|
|
Arrays.asList(
|
|
"pdftohtml", "-c", tempInputFile.toString(), pdfBaseName));
|
|
|
|
ProcessExecutorResult returnCode =
|
|
ProcessExecutor.getInstance(ProcessExecutor.Processes.PDFTOHTML)
|
|
.runCommandWithOutputHandling(command, tempOutputDir.toFile());
|
|
|
|
// Get output files
|
|
File[] outputFiles = Objects.requireNonNull(tempOutputDir.toFile().listFiles());
|
|
|
|
// Return output files in a ZIP archive
|
|
fileName = pdfBaseName + "ToHtml.zip";
|
|
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
|
|
try (ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream)) {
|
|
for (File outputFile : outputFiles) {
|
|
ZipEntry entry = new ZipEntry(outputFile.getName());
|
|
zipOutputStream.putNextEntry(entry);
|
|
try (FileInputStream fis = new FileInputStream(outputFile)) {
|
|
IOUtils.copy(fis, zipOutputStream);
|
|
} catch (IOException e) {
|
|
log.error("Exception writing zip entry", e);
|
|
}
|
|
zipOutputStream.closeEntry();
|
|
}
|
|
} catch (IOException e) {
|
|
log.error("Exception writing zip", e);
|
|
}
|
|
fileBytes = byteArrayOutputStream.toByteArray();
|
|
|
|
} finally {
|
|
// Clean up the temporary files
|
|
if (tempInputFile != null) Files.deleteIfExists(tempInputFile);
|
|
if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile());
|
|
}
|
|
|
|
return WebResponseUtils.bytesToWebResponse(
|
|
fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);
|
|
}
|
|
|
|
public ResponseEntity<byte[]> processPdfToOfficeFormat(
|
|
MultipartFile inputFile, String outputFormat, String libreOfficeFilter)
|
|
throws IOException, InterruptedException {
|
|
|
|
if (!"application/pdf".equals(inputFile.getContentType())) {
|
|
return new ResponseEntity<>(HttpStatus.BAD_REQUEST);
|
|
}
|
|
|
|
// Get the original PDF file name without the extension
|
|
String originalPdfFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename());
|
|
|
|
if (originalPdfFileName == null || "".equals(originalPdfFileName.trim())) {
|
|
originalPdfFileName = "output.pdf";
|
|
}
|
|
// Assume file is pdf if no extension
|
|
String pdfBaseName = originalPdfFileName;
|
|
if (originalPdfFileName.contains(".")) {
|
|
pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.'));
|
|
}
|
|
// Validate output format
|
|
List<String> allowedFormats =
|
|
Arrays.asList("doc", "docx", "odt", "ppt", "pptx", "odp", "rtf", "xml", "txt:Text");
|
|
if (!allowedFormats.contains(outputFormat)) {
|
|
return new ResponseEntity<>(HttpStatus.BAD_REQUEST);
|
|
}
|
|
|
|
Path tempInputFile = null;
|
|
Path tempOutputDir = null;
|
|
byte[] fileBytes;
|
|
String fileName = "temp.file";
|
|
|
|
try {
|
|
// Save the uploaded file to a temporary location
|
|
tempInputFile = Files.createTempFile("input_", ".pdf");
|
|
inputFile.transferTo(tempInputFile);
|
|
|
|
// Prepare the output directory
|
|
tempOutputDir = Files.createTempDirectory("output_");
|
|
|
|
// Run the LibreOffice command
|
|
List<String> command =
|
|
new ArrayList<>(
|
|
Arrays.asList(
|
|
"soffice",
|
|
"--headless",
|
|
"--nologo",
|
|
"--infilter=" + libreOfficeFilter,
|
|
"--convert-to",
|
|
outputFormat,
|
|
"--outdir",
|
|
tempOutputDir.toString(),
|
|
tempInputFile.toString()));
|
|
ProcessExecutorResult returnCode =
|
|
ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE)
|
|
.runCommandWithOutputHandling(command);
|
|
|
|
// Get output files
|
|
List<File> outputFiles = Arrays.asList(tempOutputDir.toFile().listFiles());
|
|
|
|
if (outputFiles.size() == 1) {
|
|
// Return single output file
|
|
File outputFile = outputFiles.get(0);
|
|
if ("txt:Text".equals(outputFormat)) {
|
|
outputFormat = "txt";
|
|
}
|
|
fileName = pdfBaseName + "." + outputFormat;
|
|
fileBytes = FileUtils.readFileToByteArray(outputFile);
|
|
} else {
|
|
// Return output files in a ZIP archive
|
|
fileName = pdfBaseName + "To" + outputFormat + ".zip";
|
|
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
|
|
try (ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream)) {
|
|
for (File outputFile : outputFiles) {
|
|
ZipEntry entry = new ZipEntry(outputFile.getName());
|
|
zipOutputStream.putNextEntry(entry);
|
|
try (FileInputStream fis = new FileInputStream(outputFile)) {
|
|
IOUtils.copy(fis, zipOutputStream);
|
|
} catch (IOException e) {
|
|
log.error("Exception writing zip entry", e);
|
|
}
|
|
|
|
zipOutputStream.closeEntry();
|
|
}
|
|
} catch (IOException e) {
|
|
log.error("Exception writing zip", e);
|
|
}
|
|
|
|
fileBytes = byteArrayOutputStream.toByteArray();
|
|
}
|
|
|
|
} finally {
|
|
// Clean up the temporary files
|
|
Files.deleteIfExists(tempInputFile);
|
|
if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile());
|
|
}
|
|
return WebResponseUtils.bytesToWebResponse(
|
|
fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);
|
|
}
|
|
}
|