Stirling-PDF/src/main/java/stirling/software/SPDF/utils/PDFToFile.java

package stirling.software.SPDF.utils;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.springframework.http.HttpStatus;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.web.multipart.MultipartFile;

import com.vladsch.flexmark.html2md.converter.FlexmarkHtmlConverter;
import com.vladsch.flexmark.util.data.MutableDataSet;

import io.github.pixee.security.Filenames;

import lombok.extern.slf4j.Slf4j;
import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult;

@Slf4j
public class PDFToFile {

    public ResponseEntity<byte[]> processPdfToMarkdown(MultipartFile inputFile)
            throws IOException, InterruptedException {
        if (!"application/pdf".equals(inputFile.getContentType())) {
            return new ResponseEntity<>(HttpStatus.BAD_REQUEST);
        }

        MutableDataSet options =
                new MutableDataSet()
                        .set(
                                FlexmarkHtmlConverter.MAX_BLANK_LINES,
                                2) // Control max consecutive blank lines
                        .set(
                                FlexmarkHtmlConverter.MAX_TRAILING_BLANK_LINES,
                                1) // Control trailing blank lines
                        .set(
                                FlexmarkHtmlConverter.SETEXT_HEADINGS,
                                true) // Use Setext headings for h1 and h2
                        .set(
                                FlexmarkHtmlConverter.OUTPUT_UNKNOWN_TAGS,
                                false) // Don't output HTML for unknown tags
                        .set(
                                FlexmarkHtmlConverter.TYPOGRAPHIC_QUOTES,
                                true) // Convert quotation marks
                        .set(
                                FlexmarkHtmlConverter.BR_AS_PARA_BREAKS,
                                true) // Convert <br> to paragraph breaks
                        .set(FlexmarkHtmlConverter.CODE_INDENT, "    "); // Indent for code blocks

        FlexmarkHtmlConverter htmlToMarkdownConverter =
                FlexmarkHtmlConverter.builder(options).build();

        String originalPdfFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename());
        String pdfBaseName = originalPdfFileName;
        if (originalPdfFileName.contains(".")) {
            pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.'));
        }

        Path tempInputFile = null;
        Path tempOutputDir = null;
        byte[] fileBytes;
        String fileName = "temp.file";

        try {
            tempInputFile = Files.createTempFile("input_", ".pdf");
            inputFile.transferTo(tempInputFile);
            tempOutputDir = Files.createTempDirectory("output_");

            List<String> command =
                    new ArrayList<>(
                            Arrays.asList(
                                    "pdftohtml",
                                    "-s",
                                    "-noframes",
                                    "-c",
                                    tempInputFile.toString(),
                                    pdfBaseName));

            ProcessExecutorResult returnCode =
                    ProcessExecutor.getInstance(ProcessExecutor.Processes.PDFTOHTML)
                            .runCommandWithOutputHandling(command, tempOutputDir.toFile());
            // Process HTML files to Markdown
            File[] outputFiles = Objects.requireNonNull(tempOutputDir.toFile().listFiles());
            List<File> markdownFiles = new ArrayList<>();

            // Convert HTML files to Markdown
            for (File outputFile : outputFiles) {
                if (outputFile.getName().endsWith(".html")) {
                    String html = Files.readString(outputFile.toPath());
                    String markdown = htmlToMarkdownConverter.convert(html);

                    String mdFileName = outputFile.getName().replace(".html", ".md");
                    File mdFile = new File(tempOutputDir.toFile(), mdFileName);
                    Files.writeString(mdFile.toPath(), markdown);
                    markdownFiles.add(mdFile);
                }
            }

            // If there's only one markdown file, return it directly
            if (markdownFiles.size() == 1) {
                fileName = pdfBaseName + ".md";
                fileBytes = Files.readAllBytes(markdownFiles.get(0).toPath());
            } else {
                // Multiple files - create a zip
                fileName = pdfBaseName + "ToMarkdown.zip";
                ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();

                try (ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream)) {
                    // Add markdown files
                    for (File mdFile : markdownFiles) {
                        ZipEntry mdEntry = new ZipEntry(mdFile.getName());
                        zipOutputStream.putNextEntry(mdEntry);
                        Files.copy(mdFile.toPath(), zipOutputStream);
                        zipOutputStream.closeEntry();
                    }

                    // Add images and other assets
                    for (File file : outputFiles) {
                        if (!file.getName().endsWith(".html") && !file.getName().endsWith(".md")) {
                            ZipEntry assetEntry = new ZipEntry(file.getName());
                            zipOutputStream.putNextEntry(assetEntry);
                            Files.copy(file.toPath(), zipOutputStream);
                            zipOutputStream.closeEntry();
                        }
                    }
                }

                fileBytes = byteArrayOutputStream.toByteArray();
            }

        } finally {
            if (tempInputFile != null) Files.deleteIfExists(tempInputFile);
            if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile());
        }
        return WebResponseUtils.bytesToWebResponse(
                fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);
    }

    public ResponseEntity<byte[]> processPdfToHtml(MultipartFile inputFile)
            throws IOException, InterruptedException {
        if (!"application/pdf".equals(inputFile.getContentType())) {
            return new ResponseEntity<>(HttpStatus.BAD_REQUEST);
        }

        // Get the original PDF file name without the extension
        String originalPdfFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename());
        String pdfBaseName = originalPdfFileName;
        if (originalPdfFileName.contains(".")) {
            pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.'));
        }

        Path tempInputFile = null;
        Path tempOutputDir = null;
        byte[] fileBytes;
        String fileName = "temp.file";

        try {
            // Save the uploaded file to a temporary location
            tempInputFile = Files.createTempFile("input_", ".pdf");
            inputFile.transferTo(tempInputFile);

            // Prepare the output directory
            tempOutputDir = Files.createTempDirectory("output_");

            // Run the pdftohtml command with complex output
            List<String> command =
                    new ArrayList<>(
                            Arrays.asList(
                                    "pdftohtml", "-c", tempInputFile.toString(), pdfBaseName));

            ProcessExecutorResult returnCode =
                    ProcessExecutor.getInstance(ProcessExecutor.Processes.PDFTOHTML)
                            .runCommandWithOutputHandling(command, tempOutputDir.toFile());

            // Get output files
            File[] outputFiles = Objects.requireNonNull(tempOutputDir.toFile().listFiles());

            // Return output files in a ZIP archive
            fileName = pdfBaseName + "ToHtml.zip";
            ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
            try (ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream)) {
                for (File outputFile : outputFiles) {
                    ZipEntry entry = new ZipEntry(outputFile.getName());
                    zipOutputStream.putNextEntry(entry);
                    try (FileInputStream fis = new FileInputStream(outputFile)) {
                        IOUtils.copy(fis, zipOutputStream);
                    } catch (IOException e) {
                        log.error("Exception writing zip entry", e);
                    }
                    zipOutputStream.closeEntry();
                }
            } catch (IOException e) {
                log.error("Exception writing zip", e);
            }
            fileBytes = byteArrayOutputStream.toByteArray();

        } finally {
            // Clean up the temporary files
            if (tempInputFile != null) Files.deleteIfExists(tempInputFile);
            if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile());
        }

        return WebResponseUtils.bytesToWebResponse(
                fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);
    }

    public ResponseEntity<byte[]> processPdfToOfficeFormat(
            MultipartFile inputFile, String outputFormat, String libreOfficeFilter)
            throws IOException, InterruptedException {

        if (!"application/pdf".equals(inputFile.getContentType())) {
            return new ResponseEntity<>(HttpStatus.BAD_REQUEST);
        }

        // Get the original PDF file name without the extension
        String originalPdfFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename());

        if (originalPdfFileName == null || "".equals(originalPdfFileName.trim())) {
            originalPdfFileName = "output.pdf";
        }
        // Assume file is pdf if no extension
        String pdfBaseName = originalPdfFileName;
        if (originalPdfFileName.contains(".")) {
            pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.'));
        }
        // Validate output format
        List<String> allowedFormats =
                Arrays.asList("doc", "docx", "odt", "ppt", "pptx", "odp", "rtf", "xml", "txt:Text");
        if (!allowedFormats.contains(outputFormat)) {
            return new ResponseEntity<>(HttpStatus.BAD_REQUEST);
        }

        Path tempInputFile = null;
        Path tempOutputDir = null;
        byte[] fileBytes;
        String fileName = "temp.file";

        try {
            // Save the uploaded file to a temporary location
            tempInputFile = Files.createTempFile("input_", ".pdf");
            inputFile.transferTo(tempInputFile);

            // Prepare the output directory
            tempOutputDir = Files.createTempDirectory("output_");

            // Run the LibreOffice command
            List<String> command =
                    new ArrayList<>(
                            Arrays.asList(
                                    "soffice",
                                    "--headless",
                                    "--nologo",
                                    "--infilter=" + libreOfficeFilter,
                                    "--convert-to",
                                    outputFormat,
                                    "--outdir",
                                    tempOutputDir.toString(),
                                    tempInputFile.toString()));
            ProcessExecutorResult returnCode =
                    ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE)
                            .runCommandWithOutputHandling(command);

            // Get output files
            List<File> outputFiles = Arrays.asList(tempOutputDir.toFile().listFiles());

            if (outputFiles.size() == 1) {
                // Return single output file
                File outputFile = outputFiles.get(0);
                if ("txt:Text".equals(outputFormat)) {
                    outputFormat = "txt";
                }
                fileName = pdfBaseName + "." + outputFormat;
                fileBytes = FileUtils.readFileToByteArray(outputFile);
            } else {
                // Return output files in a ZIP archive
                fileName = pdfBaseName + "To" + outputFormat + ".zip";
                ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
                try (ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream)) {
                    for (File outputFile : outputFiles) {
                        ZipEntry entry = new ZipEntry(outputFile.getName());
                        zipOutputStream.putNextEntry(entry);
                        try (FileInputStream fis = new FileInputStream(outputFile)) {
                            IOUtils.copy(fis, zipOutputStream);
                        } catch (IOException e) {
                            log.error("Exception writing zip entry", e);
                        }

                        zipOutputStream.closeEntry();
                    }
                } catch (IOException e) {
                    log.error("Exception writing zip", e);
                }

                fileBytes = byteArrayOutputStream.toByteArray();
            }

        } finally {
            // Clean up the temporary files
            Files.deleteIfExists(tempInputFile);
            if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile());
        }
        return WebResponseUtils.bytesToWebResponse(
                fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);
    }
}
Convert PDF to Docx, powerpoint and others (#90) 2023-04-16 22:03:30 +01:00			`package stirling.software.SPDF.utils;`
format and move everything, other in own folder 2023-04-22 12:51:01 +01:00
Convert PDF to Docx, powerpoint and others (#90) 2023-04-16 22:03:30 +01:00			`import java.io.ByteArrayOutputStream;`
			`import java.io.File;`
			`import java.io.FileInputStream;`
			`import java.io.IOException;`
			`import java.nio.file.Files;`
			`import java.nio.file.Path;`
			`import java.util.ArrayList;`
			`import java.util.Arrays;`
			`import java.util.List;`
File paths dynamic (#2605) # Description Please provide a summary of the changes, including relevant motivation and context. Closes #(issue_number) ## Checklist - [ ] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [ ] I have performed a self-review of my own code - [ ] I have attached images of the change if it is UI based - [ ] I have commented my code, particularly in hard-to-understand areas - [ ] If my code has heavily changed functionality I have updated relevant docs on [Stirling-PDFs doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) - [ ] My changes generate no new warnings - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) --------- Co-authored-by: pixeebot[bot] <104101892+pixeebot[bot]@users.noreply.github.com> Co-authored-by: a <a> 2025-01-06 12:41:30 +00:00			`import java.util.Objects;`
Convert PDF to Docx, powerpoint and others (#90) 2023-04-16 22:03:30 +01:00			`import java.util.zip.ZipEntry;`
			`import java.util.zip.ZipOutputStream;`

			`import org.apache.commons.io.FileUtils;`
			`import org.apache.commons.io.IOUtils;`
			`import org.springframework.http.HttpStatus;`
			`import org.springframework.http.MediaType;`
			`import org.springframework.http.ResponseEntity;`
			`import org.springframework.web.multipart.MultipartFile;`
format and move everything, other in own folder 2023-04-22 12:51:01 +01:00
Pdf to markdown (#2730) # Description Please provide a summary of the changes, including relevant motivation and context. Closes #(issue_number) ## Checklist - [ ] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [ ] I have performed a self-review of my own code - [ ] I have attached images of the change if it is UI based - [ ] I have commented my code, particularly in hard-to-understand areas - [ ] If my code has heavily changed functionality I have updated relevant docs on [Stirling-PDFs doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) - [ ] My changes generate no new warnings - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) --------- Co-authored-by: a <a> 2025-01-17 22:18:55 +00:00			`import com.vladsch.flexmark.html2md.converter.FlexmarkHtmlConverter;`
			`import com.vladsch.flexmark.util.data.MutableDataSet;`

fix: use the same margins for x and y in the stamp feature 2024-02-07 21:40:33 -05:00			`import io.github.pixee.security.Filenames;`
format 2024-06-02 12:02:01 +01:00
Remove Direct Logger and Use Lombok `@Slf4j` 2024-12-17 10:26:18 +01:00			`import lombok.extern.slf4j.Slf4j;`
changes to script executor and init 2023-07-29 13:53:30 +01:00			`import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult;`

Remove Direct Logger and Use Lombok `@Slf4j` 2024-12-17 10:26:18 +01:00			`@Slf4j`
Convert PDF to Docx, powerpoint and others (#90) 2023-04-16 22:03:30 +01:00			`public class PDFToFile {`
fix: switch to pdftohtml for pdf to html conversions (#998) * fix: switch to pdftohtml for pdf to html conversions * build: include poppler-utils in dockerfile for pdftohtml 2024-03-29 17:02:33 -04:00
Pdf to markdown (#2730) # Description Please provide a summary of the changes, including relevant motivation and context. Closes #(issue_number) ## Checklist - [ ] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [ ] I have performed a self-review of my own code - [ ] I have attached images of the change if it is UI based - [ ] I have commented my code, particularly in hard-to-understand areas - [ ] If my code has heavily changed functionality I have updated relevant docs on [Stirling-PDFs doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) - [ ] My changes generate no new warnings - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) --------- Co-authored-by: a <a> 2025-01-17 22:18:55 +00:00			`public ResponseEntity<byte[]> processPdfToMarkdown(MultipartFile inputFile)`
			`throws IOException, InterruptedException {`
			`if (!"application/pdf".equals(inputFile.getContentType())) {`
			`return new ResponseEntity<>(HttpStatus.BAD_REQUEST);`
			`}`

			`MutableDataSet options =`
			`new MutableDataSet()`
			`.set(`
			`FlexmarkHtmlConverter.MAX_BLANK_LINES,`
			`2) // Control max consecutive blank lines`
			`.set(`
			`FlexmarkHtmlConverter.MAX_TRAILING_BLANK_LINES,`
			`1) // Control trailing blank lines`
			`.set(`
			`FlexmarkHtmlConverter.SETEXT_HEADINGS,`
			`true) // Use Setext headings for h1 and h2`
			`.set(`
			`FlexmarkHtmlConverter.OUTPUT_UNKNOWN_TAGS,`
			`false) // Don't output HTML for unknown tags`
			`.set(`
			`FlexmarkHtmlConverter.TYPOGRAPHIC_QUOTES,`
			`true) // Convert quotation marks`
			`.set(`
			`FlexmarkHtmlConverter.BR_AS_PARA_BREAKS,`
			`true) // Convert <br> to paragraph breaks`
			`.set(FlexmarkHtmlConverter.CODE_INDENT, " "); // Indent for code blocks`

			`FlexmarkHtmlConverter htmlToMarkdownConverter =`
			`FlexmarkHtmlConverter.builder(options).build();`

			`String originalPdfFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename());`
			`String pdfBaseName = originalPdfFileName;`
			`if (originalPdfFileName.contains(".")) {`
			`pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.'));`
			`}`

			`Path tempInputFile = null;`
			`Path tempOutputDir = null;`
			`byte[] fileBytes;`
			`String fileName = "temp.file";`

			`try {`
			`tempInputFile = Files.createTempFile("input_", ".pdf");`
			`inputFile.transferTo(tempInputFile);`
			`tempOutputDir = Files.createTempDirectory("output_");`

			`List<String> command =`
			`new ArrayList<>(`
			`Arrays.asList(`
			`"pdftohtml",`
			`"-s",`
			`"-noframes",`
			`"-c",`
			`tempInputFile.toString(),`
			`pdfBaseName));`

			`ProcessExecutorResult returnCode =`
			`ProcessExecutor.getInstance(ProcessExecutor.Processes.PDFTOHTML)`
			`.runCommandWithOutputHandling(command, tempOutputDir.toFile());`
			`// Process HTML files to Markdown`
			`File[] outputFiles = Objects.requireNonNull(tempOutputDir.toFile().listFiles());`
			`List<File> markdownFiles = new ArrayList<>();`

			`// Convert HTML files to Markdown`
			`for (File outputFile : outputFiles) {`
			`if (outputFile.getName().endsWith(".html")) {`
			`String html = Files.readString(outputFile.toPath());`
			`String markdown = htmlToMarkdownConverter.convert(html);`

			`String mdFileName = outputFile.getName().replace(".html", ".md");`
			`File mdFile = new File(tempOutputDir.toFile(), mdFileName);`
			`Files.writeString(mdFile.toPath(), markdown);`
			`markdownFiles.add(mdFile);`
			`}`
			`}`

			`// If there's only one markdown file, return it directly`
			`if (markdownFiles.size() == 1) {`
			`fileName = pdfBaseName + ".md";`
			`fileBytes = Files.readAllBytes(markdownFiles.get(0).toPath());`
			`} else {`
			`// Multiple files - create a zip`
			`fileName = pdfBaseName + "ToMarkdown.zip";`
			`ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();`

			`try (ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream)) {`
			`// Add markdown files`
			`for (File mdFile : markdownFiles) {`
			`ZipEntry mdEntry = new ZipEntry(mdFile.getName());`
			`zipOutputStream.putNextEntry(mdEntry);`
			`Files.copy(mdFile.toPath(), zipOutputStream);`
			`zipOutputStream.closeEntry();`
			`}`

			`// Add images and other assets`
			`for (File file : outputFiles) {`
			`if (!file.getName().endsWith(".html") && !file.getName().endsWith(".md")) {`
			`ZipEntry assetEntry = new ZipEntry(file.getName());`
			`zipOutputStream.putNextEntry(assetEntry);`
			`Files.copy(file.toPath(), zipOutputStream);`
			`zipOutputStream.closeEntry();`
			`}`
			`}`
			`}`

			`fileBytes = byteArrayOutputStream.toByteArray();`
			`}`

			`} finally {`
			`if (tempInputFile != null) Files.deleteIfExists(tempInputFile);`
			`if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile());`
			`}`
			`return WebResponseUtils.bytesToWebResponse(`
			`fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);`
			`}`

fix: switch to pdftohtml for pdf to html conversions (#998) * fix: switch to pdftohtml for pdf to html conversions * build: include poppler-utils in dockerfile for pdftohtml 2024-03-29 17:02:33 -04:00			`public ResponseEntity<byte[]> processPdfToHtml(MultipartFile inputFile)`
			`throws IOException, InterruptedException {`
			`if (!"application/pdf".equals(inputFile.getContentType())) {`
			`return new ResponseEntity<>(HttpStatus.BAD_REQUEST);`
			`}`

			`// Get the original PDF file name without the extension`
			`String originalPdfFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename());`
changes 2024-05-26 15:31:34 +01:00			`String pdfBaseName = originalPdfFileName;`
			`if (originalPdfFileName.contains(".")) {`
			`pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.'));`
			`}`
fix: switch to pdftohtml for pdf to html conversions (#998) * fix: switch to pdftohtml for pdf to html conversions * build: include poppler-utils in dockerfile for pdftohtml 2024-03-29 17:02:33 -04:00
			`Path tempInputFile = null;`
			`Path tempOutputDir = null;`
			`byte[] fileBytes;`
			`String fileName = "temp.file";`

			`try {`
			`// Save the uploaded file to a temporary location`
			`tempInputFile = Files.createTempFile("input_", ".pdf");`
deletion changes 2024-05-27 17:53:18 +01:00			`inputFile.transferTo(tempInputFile);`
fix: switch to pdftohtml for pdf to html conversions (#998) * fix: switch to pdftohtml for pdf to html conversions * build: include poppler-utils in dockerfile for pdftohtml 2024-03-29 17:02:33 -04:00
			`// Prepare the output directory`
			`tempOutputDir = Files.createTempDirectory("output_");`

			`// Run the pdftohtml command with complex output`
			`List<String> command =`
			`new ArrayList<>(`
			`Arrays.asList(`
			`"pdftohtml", "-c", tempInputFile.toString(), pdfBaseName));`

			`ProcessExecutorResult returnCode =`
			`ProcessExecutor.getInstance(ProcessExecutor.Processes.PDFTOHTML)`
			`.runCommandWithOutputHandling(command, tempOutputDir.toFile());`

			`// Get output files`
File paths dynamic (#2605) # Description Please provide a summary of the changes, including relevant motivation and context. Closes #(issue_number) ## Checklist - [ ] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md) - [ ] I have performed a self-review of my own code - [ ] I have attached images of the change if it is UI based - [ ] I have commented my code, particularly in hard-to-understand areas - [ ] If my code has heavily changed functionality I have updated relevant docs on [Stirling-PDFs doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/) - [ ] My changes generate no new warnings - [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only) --------- Co-authored-by: pixeebot[bot] <104101892+pixeebot[bot]@users.noreply.github.com> Co-authored-by: a <a> 2025-01-06 12:41:30 +00:00			`File[] outputFiles = Objects.requireNonNull(tempOutputDir.toFile().listFiles());`
fix: switch to pdftohtml for pdf to html conversions (#998) * fix: switch to pdftohtml for pdf to html conversions * build: include poppler-utils in dockerfile for pdftohtml 2024-03-29 17:02:33 -04:00
			`// Return output files in a ZIP archive`
			`fileName = pdfBaseName + "ToHtml.zip";`
			`ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();`
logging and try catch 2024-06-02 11:59:43 +01:00			`try (ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream)) {`
			`for (File outputFile : outputFiles) {`
			`ZipEntry entry = new ZipEntry(outputFile.getName());`
			`zipOutputStream.putNextEntry(entry);`
			`try (FileInputStream fis = new FileInputStream(outputFile)) {`
			`IOUtils.copy(fis, zipOutputStream);`
			`} catch (IOException e) {`
Remove Direct Logger and Use Lombok `@Slf4j` 2024-12-17 10:26:18 +01:00			`log.error("Exception writing zip entry", e);`
logging and try catch 2024-06-02 11:59:43 +01:00			`}`
			`zipOutputStream.closeEntry();`
			`}`
			`} catch (IOException e) {`
Remove Direct Logger and Use Lombok `@Slf4j` 2024-12-17 10:26:18 +01:00			`log.error("Exception writing zip", e);`
fix: switch to pdftohtml for pdf to html conversions (#998) * fix: switch to pdftohtml for pdf to html conversions * build: include poppler-utils in dockerfile for pdftohtml 2024-03-29 17:02:33 -04:00			`}`
			`fileBytes = byteArrayOutputStream.toByteArray();`

			`} finally {`
			`// Clean up the temporary files`
deletion changes 2024-05-27 17:53:18 +01:00			`if (tempInputFile != null) Files.deleteIfExists(tempInputFile);`
fix: switch to pdftohtml for pdf to html conversions (#998) * fix: switch to pdftohtml for pdf to html conversions * build: include poppler-utils in dockerfile for pdftohtml 2024-03-29 17:02:33 -04:00			`if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile());`
			`}`

			`return WebResponseUtils.bytesToWebResponse(`
			`fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);`
			`}`

format and move everything, other in own folder 2023-04-22 12:51:01 +01:00			`public ResponseEntity<byte[]> processPdfToOfficeFormat(`
			`MultipartFile inputFile, String outputFormat, String libreOfficeFilter)`
			`throws IOException, InterruptedException {`
Convert PDF to Docx, powerpoint and others (#90) 2023-04-16 22:03:30 +01:00
			`if (!"application/pdf".equals(inputFile.getContentType())) {`
			`return new ResponseEntity<>(HttpStatus.BAD_REQUEST);`
			`}`

			`// Get the original PDF file name without the extension`
Sanitized user-provided file names in HTTP multipart uploads 2024-02-01 23:48:27 +00:00			`String originalPdfFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename());`
Convert PDF to Docx, powerpoint and others (#90) 2023-04-16 22:03:30 +01:00
changes 2024-05-26 15:31:34 +01:00			`if (originalPdfFileName == null \|\| "".equals(originalPdfFileName.trim())) {`
			`originalPdfFileName = "output.pdf";`
			`}`
			`// Assume file is pdf if no extension`
			`String pdfBaseName = originalPdfFileName;`
			`if (originalPdfFileName.contains(".")) {`
			`pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.'));`
			`}`
Convert PDF to Docx, powerpoint and others (#90) 2023-04-16 22:03:30 +01:00			`// Validate output format`
format and move everything, other in own folder 2023-04-22 12:51:01 +01:00			`List<String> allowedFormats =`
fix: switch to pdftohtml for pdf to html conversions (#998) * fix: switch to pdftohtml for pdf to html conversions * build: include poppler-utils in dockerfile for pdftohtml 2024-03-29 17:02:33 -04:00			`Arrays.asList("doc", "docx", "odt", "ppt", "pptx", "odp", "rtf", "xml", "txt:Text");`
Convert PDF to Docx, powerpoint and others (#90) 2023-04-16 22:03:30 +01:00			`if (!allowedFormats.contains(outputFormat)) {`
			`return new ResponseEntity<>(HttpStatus.BAD_REQUEST);`
			`}`
format and move everything, other in own folder 2023-04-22 12:51:01 +01:00
Convert PDF to Docx, powerpoint and others (#90) 2023-04-16 22:03:30 +01:00			`Path tempInputFile = null;`
			`Path tempOutputDir = null;`
			`byte[] fileBytes;`
utf8 bug fix and scan pages (#113) 2023-05-01 21:57:48 +01:00			`String fileName = "temp.file";`
Convert PDF to Docx, powerpoint and others (#90) 2023-04-16 22:03:30 +01:00
			`try {`
			`// Save the uploaded file to a temporary location`
			`tempInputFile = Files.createTempFile("input_", ".pdf");`
deletion changes 2024-05-27 17:53:18 +01:00			`inputFile.transferTo(tempInputFile);`
Convert PDF to Docx, powerpoint and others (#90) 2023-04-16 22:03:30 +01:00
			`// Prepare the output directory`
			`tempOutputDir = Files.createTempDirectory("output_");`

			`// Run the LibreOffice command`
format and move everything, other in own folder 2023-04-22 12:51:01 +01:00			`List<String> command =`
			`new ArrayList<>(`
			`Arrays.asList(`
			`"soffice",`
When converting PDF to word, add parameters to speed up soffice startup (#1450) When converting PDF to word, add parameters to speed up soffice startup 2024-06-14 01:13:38 +08:00			`"--headless",`
			`"--nologo",`
format and move everything, other in own folder 2023-04-22 12:51:01 +01:00			`"--infilter=" + libreOfficeFilter,`
			`"--convert-to",`
			`outputFormat,`
			`"--outdir",`
			`tempOutputDir.toString(),`
			`tempInputFile.toString()));`
changes to script executor and init 2023-07-29 13:53:30 +01:00			`ProcessExecutorResult returnCode =`
			`ProcessExecutor.getInstance(ProcessExecutor.Processes.LIBRE_OFFICE)`
			`.runCommandWithOutputHandling(command);`
Convert PDF to Docx, powerpoint and others (#90) 2023-04-16 22:03:30 +01:00
			`// Get output files`
			`List<File> outputFiles = Arrays.asList(tempOutputDir.toFile().listFiles());`

			`if (outputFiles.size() == 1) {`
			`// Return single output file`
			`File outputFile = outputFiles.get(0);`
Switch order of literals to prevent NullPointerException 2024-02-02 00:29:18 +00:00			`if ("txt:Text".equals(outputFormat)) {`
format and move everything, other in own folder 2023-04-22 12:51:01 +01:00			`outputFormat = "txt";`
Convert PDF to Docx, powerpoint and others (#90) 2023-04-16 22:03:30 +01:00			`}`
utf8 bug fix and scan pages (#113) 2023-05-01 21:57:48 +01:00			`fileName = pdfBaseName + "." + outputFormat;`
Convert PDF to Docx, powerpoint and others (#90) 2023-04-16 22:03:30 +01:00			`fileBytes = FileUtils.readFileToByteArray(outputFile);`
			`} else {`
			`// Return output files in a ZIP archive`
format 2024-06-02 12:02:01 +01:00			`fileName = pdfBaseName + "To" + outputFormat + ".zip";`
			`ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();`
			`try (ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream)) {`
			`for (File outputFile : outputFiles) {`
			`ZipEntry entry = new ZipEntry(outputFile.getName());`
			`zipOutputStream.putNextEntry(entry);`
			`try (FileInputStream fis = new FileInputStream(outputFile)) {`
			`IOUtils.copy(fis, zipOutputStream);`
			`} catch (IOException e) {`
Remove Direct Logger and Use Lombok `@Slf4j` 2024-12-17 10:26:18 +01:00			`log.error("Exception writing zip entry", e);`
format 2024-06-02 12:02:01 +01:00			`}`

			`zipOutputStream.closeEntry();`
			`}`
			`} catch (IOException e) {`
Remove Direct Logger and Use Lombok `@Slf4j` 2024-12-17 10:26:18 +01:00			`log.error("Exception writing zip", e);`
format 2024-06-02 12:02:01 +01:00			`}`

			`fileBytes = byteArrayOutputStream.toByteArray();`
Convert PDF to Docx, powerpoint and others (#90) 2023-04-16 22:03:30 +01:00			`}`

			`} finally {`
			`// Clean up the temporary files`
deletion changes 2024-05-27 17:53:18 +01:00			`Files.deleteIfExists(tempInputFile);`
Convert PDF to Docx, powerpoint and others (#90) 2023-04-16 22:03:30 +01:00			`if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile());`
			`}`
util move around 2023-05-31 20:15:48 +01:00			`return WebResponseUtils.bytesToWebResponse(`
			`fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);`
Convert PDF to Docx, powerpoint and others (#90) 2023-04-16 22:03:30 +01:00			`}`
			`}`