LuteOrg · jzohrab · Jan 10, 2024 · Jan 7, 2024 · Jan 8, 2024 · Jan 8, 2024
diff --git a/lute/book/forms.py b/lute/book/forms.py
@@ -27,8 +27,8 @@ class NewBookForm(FlaskForm):
  "Text file",
  validators=[
  FileAllowed(
- ["txt", "epub"],
- "Please upload a valid .txt or .epub file.",
+ ["txt", "epub", "pdf"],
+ "Please upload a valid '.txt', '.epub' or '.pdf' file.",
  )
  ],
  )

diff --git a/lute/book/routes.py b/lute/book/routes.py
@@ -62,6 +62,8 @@ def _get_file_content(filefielddata):
  return service.get_textfile_content(filefielddata)
  if ext == ".epub":
  return service.get_epub_content(filefielddata)
+ if ext == ".pdf":
+ return service.get_pdf_content_from_form(filefielddata)
  raise ValueError(f'Unknown file extension "{ext}"')
 
 

diff --git a/lute/book/service.py b/lute/book/service.py
@@ -11,6 +11,7 @@
 from bs4 import BeautifulSoup
 from flask import current_app, flash
 from openepub import Epub, EpubError
+from pypdf import PdfReader
 from werkzeug.utils import secure_filename
 from lute.book.model import Book
 
@@ -82,6 +83,21 @@ def get_epub_content(epub_file_field_data):
  return content
 
 
+def get_pdf_content_from_form(pdf_file_field_data):
+    """Return the text of all pages of an uploaded PDF as one string (via pypdf).
+
+    Raises BookImportException, chained to the underlying error, if parsing fails.
+    """
+    try:
+        # Building the result with join() avoids repeated string re-allocation
+        # when a document has many pages; page texts are concatenated as-is.
+        pages = PdfReader(pdf_file_field_data).pages
+        return "".join(page.extract_text() for page in pages)
+    except Exception as e:
+        msg = f"Could not parse {pdf_file_field_data.filename} (error: {str(e)})"
+        raise BookImportException(message=msg, cause=e) from e
+
+
 def book_from_url(url):
  "Parse the url and load a new Book."
  s = None

diff --git a/lute/templates/book/create_new.html b/lute/templates/book/create_new.html
@@ -33,7 +33,7 @@
  </tr>
 
  <tr>
- <td>{{ form.textfile.label }} <i>(.txt, .epub)</i></td>
+ <td>{{ form.textfile.label }} <i>(.txt, .epub, .pdf)</i></td>
  <td>{{ form.textfile() }}</td>
  </tr>
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -31,7 +31,8 @@ dependencies = [
  "PyYAML>=6.0.1,<7",
  "toml>=0.10.2,<1",
  "waitress>=2.1.2,<3",
- "openepub>=0.0.6,<1"
+ "openepub>=0.0.6,<1",
+ "pypdf>=3.17.4"
 ]
 
 [project.scripts]

diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,4 @@
-astroid==2.15.6
+astroid==2.15.6
 attrs==23.1.0
 beautifulsoup4==4.12.2
 black==23.10.1
@@ -8,6 +8,7 @@ cffi==1.16.0
 cfgv==3.4.0
 charset-normalizer==3.3.1
 click==8.1.7
+colorama==0.4.6
 coverage==7.3.1
 dill==0.3.7
 distlib==0.3.7
@@ -37,6 +38,7 @@ mccabe==0.7.0
 mypy-extensions==1.0.0
 natto-py==1.0.1
 nodeenv==1.8.0
+openepub==0.0.6
 outcome==1.3.0.post0
 packaging==23.1
 parse==1.19.1
@@ -50,6 +52,7 @@ pre-commit==3.5.0
 pycparser==2.21
 pyee==11.0.1
 pylint==2.17.5
+pypdf==3.17.4
 PySocks==1.7.1
 pytest==7.4.2
 pytest-base-url==2.0.0
@@ -81,5 +84,5 @@ Werkzeug==2.3.7
 wrapt==1.15.0
 wsproto==1.2.0
 WTForms==3.0.1
+xmltodict==0.13.0
 zipp==3.17.0
-openepub==0.0.6
diff --git a/tests/acceptance/book.feature b/tests/acceptance/book.feature
@@ -43,6 +43,18 @@ Feature: Books and stats are available
  Given a Spanish book "Hola" from file invalid.epub
  Then the page contains "Could not parse invalid.epub"
 
+ Scenario: I can import a PDF file.
+ Given I visit "/"
+ Given a Spanish book "Hola" from file Hola.pdf
+ Then the page title is Reading "Hola"
+ And the reading pane shows:
+ Tengo/ /un/ /amigo/.
+
+ Scenario: Invalid PDF files are rejected.
+ Given I visit "/"
+ Given a Spanish book "Hola" from file invalid.pdf
+ Then the page contains "Could not parse invalid.pdf"
+
  Scenario: Books and stats are shown on the first page.
  Given I visit "/"
  Given a Spanish book "Hola" with content:

diff --git a/tests/acceptance/sample_files/Hola.pdf b/tests/acceptance/sample_files/Hola.pdf
diff --git a/tests/acceptance/sample_files/invalid.pdf b/tests/acceptance/sample_files/invalid.pdf