Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DOCX-PDF Conversion #40

Merged
merged 14 commits into from
Mar 22, 2024
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,11 @@ jobs:
- name: Set up Go
uses: actions/setup-go@v4
with:
go-version: '1.21'
go-version: "1.21"
- name: Install libreoffice
run: |
sudo apt-get update
sudo apt-get -y install libreoffice
- name: Install dependencies
run: go get .
- name: Test
Expand Down
8 changes: 7 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,16 @@ COPY . .
RUN GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -o morphos .

# Deploy the application binary into a lean image
FROM debian:bookworm-slim AS release
FROM debian:trixie-slim AS release

WORKDIR /

RUN apt-get update \
&& apt-get install -y --no-install-recommends default-jre libreoffice libreoffice-java-common \
&& apt-get autoremove -y \
&& apt-get purge -y --auto-remove \
&& rm -rf /var/lib/apt/lists/*

COPY --from=builder /app/morphos /bin/morphos
COPY --from=builder /usr/share/fonts /usr/share/fonts

Expand Down
13 changes: 10 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,16 @@ A modal will pop up with a preview of the converted image.

## Documents X Images

| | PNG | JPEG | GIF | WEBP | TIFF | BMP |
|-------|-------|--------|-------|--------|--------|-------|
| PDF | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| | PNG | JPEG | GIF | WEBP | TIFF | BMP |
| --- | --- | ---- | --- | ---- | ---- | --- |
| PDF | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |

## Documents X Documents

| | DOCX | PDF |
| ---- | ---- | --- |
| PDF | ✅ | |
| DOCX | | ✅ |

## License
The MIT License (MIT). See [LICENSE](LICENSE) file for more details.
4 changes: 2 additions & 2 deletions pkg/files/document_factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ func (d *DocumentFactory) NewFile(f string) (File, error) {
switch f {
case documents.PDF:
return documents.NewPdf(d.filename), nil
case documents.DOCX:
return new(documents.Docx), nil
case documents.DOCX, documents.DOCXMIMEType:
return documents.NewDocx(d.filename), nil
default:
return nil, fmt.Errorf("type file file %s not recognized", f)
}
Expand Down
5 changes: 3 additions & 2 deletions pkg/files/documents/documents.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
package documents

const (
DOCX = "docx"
PDF = "pdf"
DOCX = "docx"
DOCXMIMEType = "vnd.openxmlformats-officedocument.wordprocessingml.document"
PDF = "pdf"

imageMimeType = "image/"
imageType = "image"
Expand Down
73 changes: 72 additions & 1 deletion pkg/files/documents/documents_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ type documenter interface {
DocumentType() string
}

func TestPDFToImages(t *testing.T) {
func TestPDFTConvertTo(t *testing.T) {
type input struct {
filename string
mimetype string
Expand Down Expand Up @@ -88,6 +88,77 @@ func TestPDFToImages(t *testing.T) {
mimetype: "application/zip",
},
},
{
name: "pdf to docx",
input: input{
filename: "testdata/bitcoin.pdf",
mimetype: "application/pdf",
targetFileType: "Document",
targetFormat: "docx",
documenter: documents.NewPdf("bitcoin.pdf"),
},
expected: expected{
mimetype: "application/zip",
},
},
}

for _, tc := range tests {
tc := tc
t.Run(tc.name, func(t *testing.T) {
t.Parallel()

inputDoc, err := os.ReadFile(tc.input.filename)
require.NoError(t, err)

detectedFileType := mimetype.Detect(inputDoc)
require.Equal(t, tc.input.mimetype, detectedFileType.String())

outoutFile, err := tc.input.documenter.ConvertTo(
tc.input.targetFileType,
tc.input.targetFormat,
inputDoc,
)

require.NoError(t, err)

detectedFileType = mimetype.Detect(outoutFile)
require.Equal(t, tc.expected.mimetype, detectedFileType.String())
})
}
}

func TestDOCXTConvertTo(t *testing.T) {

type input struct {
filename string
mimetype string
targetFileType string
targetFormat string
documenter documenter
}
type expected struct {
mimetype string
}
var tests = []struct {
name string
input input
expected expected
}{
{

name: "docx to pdf",
input: input{
filename: "testdata/file_sample.docx",
mimetype: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
targetFileType: "Document",
targetFormat: "pdf",
documenter: documents.NewDocx("file_sample.docx"),
},
expected: expected{
mimetype: "application/zip",
},
},
}

for _, tc := range tests {
Expand Down
188 changes: 182 additions & 6 deletions pkg/files/documents/docx.go
Original file line number Diff line number Diff line change
@@ -1,17 +1,193 @@
package documents

import "errors"
import (
"archive/zip"
"bytes"
"errors"
"fmt"
"io"
"log"
"os"
"os/exec"
"path/filepath"
"slices"
"strings"
)

type Docx struct{}
// Docx implements the File and Document interfaces from the files package.
// It carries the name of a DOCX file together with the targets it can be
// converted to.
type Docx struct {
// filename is the name of the source DOCX file, as given to NewDocx.
filename string
// compatibleFormats maps a target file type (e.g. "Document") to the
// formats this document can be converted to; returned by SupportedFormats.
compatibleFormats map[string][]string
// compatibleMIMETypes maps a target file type to the list returned by
// SupportedMIMETypes.
compatibleMIMETypes map[string][]string
// OutDir is neither set by NewDocx nor read by ConvertTo in this file.
// NOTE(review): presumably an output-directory override — confirm or remove.
OutDir string
}

// NewDocx builds a Docx for the given file name. The returned value
// advertises PDF as its only conversion target, under the "Document" file
// type, for both the supported formats and the supported MIME types.
func NewDocx(filename string) *Docx {
	// Two independent maps are built on purpose so that the format list and
	// the MIME type list never share backing storage.
	formats := map[string][]string{
		"Document": {
			PDF,
		},
	}
	mimeTypes := map[string][]string{
		"Document": {
			PDF,
		},
	}

	return &Docx{
		filename:            filename,
		compatibleFormats:   formats,
		compatibleMIMETypes: mimeTypes,
	}
}

func (p *Docx) SupportedFormats() map[string][]string {
return make(map[string][]string)
// SupportedFormats returns the formats a Docx can be converted to, keyed by
// target file type. The map is the one held by the receiver, not a copy.
func (d *Docx) SupportedFormats() map[string][]string {
return d.compatibleFormats
}

func (p *Docx) ConvertTo(fileType, subType string, fileBytes []byte) ([]byte, error) {
// SupportedMIMETypes returns the MIME types a Docx can be converted to, keyed
// by target file type. The map is the one held by the receiver, not a copy.
func (d *Docx) SupportedMIMETypes() map[string][]string {
return d.compatibleMIMETypes
}

// ConvertTo converts the DOCX content in fileBytes into the requested target.
//
// fileType must be a key of SupportedFormats (e.g. "Document") and subType
// one of the formats listed under that key; otherwise an error is returned.
// The only conversion implemented is Document/PDF, whose result is a zip
// archive containing a single PDF named after the source file. Any other
// accepted combination yields a "not implemented" error.
func (d *Docx) ConvertTo(fileType, subType string, fileBytes []byte) ([]byte, error) {
	compatibleFormats, ok := d.SupportedFormats()[fileType]
	if !ok {
		return nil, fmt.Errorf("file type not supported: %s", fileType)
	}

	if !slices.Contains(compatibleFormats, subType) {
		return nil, fmt.Errorf("sub-type not supported: %s", subType)
	}

	switch strings.ToLower(fileType) {
	case documentType:
		switch subType {
		case PDF:
			return d.toZippedPDF(fileBytes)
		}
	}

	return nil, errors.New("not implemented")
}

// toZippedPDF renders fileBytes to PDF with LibreOffice and returns a zip
// archive holding that single PDF.
func (d *Docx) toZippedPDF(fileBytes []byte) ([]byte, error) {
	// Only the base name is used so that a file name carrying directory
	// components cannot make us write outside the working directory.
	base := filepath.Base(d.filename)
	if base == "." || base == string(filepath.Separator) {
		return nil, fmt.Errorf("invalid docx file name %q", d.filename)
	}
	pdfName := strings.TrimSuffix(base, filepath.Ext(base)) + ".pdf"

	// A private directory per call keeps concurrent conversions of files
	// with the same name apart and lets everything be removed in one go.
	workDir, err := os.MkdirTemp("", "morphos-docx-")
	if err != nil {
		return nil, fmt.Errorf(
			"error creating the working directory for %s: %w",
			d.filename,
			err,
		)
	}
	defer os.RemoveAll(workDir)

	// WriteFile closes the file before returning, so LibreOffice always
	// sees the complete document.
	docxPath := filepath.Join(workDir, base)
	if err := os.WriteFile(docxPath, fileBytes, 0o600); err != nil {
		return nil, fmt.Errorf(
			"error storing the incoming docx file %s: %w",
			d.filename,
			err,
		)
	}

	var (
		stdout bytes.Buffer
		stderr bytes.Buffer
	)

	// Arguments are passed directly instead of through a shell so that
	// spaces or shell metacharacters in the file name are never interpreted.
	cmd := exec.Command(
		"libreoffice",
		"--headless",
		"--convert-to", "pdf:writer_pdf_Export",
		"--outdir", workDir,
		docxPath,
	)
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		return nil, fmt.Errorf(
			"error converting docx to pdf using libreoffice: %s: %w",
			stderr.String(),
			err,
		)
	}

	log.Println(stdout.String())

	// The PDF is not pre-created: if LibreOffice exited successfully without
	// producing it, opening fails here instead of an empty PDF being zipped.
	pdfFile, err := os.Open(filepath.Join(workDir, pdfName))
	if err != nil {
		return nil, fmt.Errorf(
			"error at opening the pdf file: %w",
			err,
		)
	}
	defer pdfFile.Close()

	// The archive is assembled in memory; it is returned as bytes anyway.
	var archive bytes.Buffer
	zipWriter := zip.NewWriter(&archive)

	entry, err := zipWriter.Create(pdfName)
	if err != nil {
		return nil, fmt.Errorf(
			"error at creating the zip entry for the pdf file: %w",
			err,
		)
	}

	if _, err := io.Copy(entry, pdfFile); err != nil {
		return nil, fmt.Errorf(
			"error at writing the pdf file content to the zip writer: %w",
			err,
		)
	}

	// Close writes the central directory; without it the archive is invalid,
	// so its error must not be ignored.
	if err := zipWriter.Close(); err != nil {
		return nil, fmt.Errorf("error at finalizing the zip file: %w", err)
	}

	return archive.Bytes(), nil
}

func (p *Docx) DocumentType() string {
// DocumentType returns the DOCX constant, identifying this document's type.
func (d *Docx) DocumentType() string {
return DOCX
}
Loading
Loading