Skip to content

Made it possible to pass multiple files to pandoc - fixes #248 #259

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,23 @@ output = pypandoc.convert_file('somefile.md', 'docx', outputfile="somefile.docx"
assert output == ""
```


It's also possible to specify multiple input files to pandoc, either as absolute paths, relative paths or file patterns.

```python
import pypandoc

# convert all markdown files in a chapters/ subdirectory.
pypandoc.convert_file('chapters/*.md', 'docx', outputfile="somefile.docx")

# convert all markdown files in the book1 and book2 directories.
pypandoc.convert_file(['book1/*.md', 'book2/*.md'], 'docx', outputfile="somefile.docx")

# convert the front matter from another drive, and all markdown files in the book2 directory.
pypandoc.convert_file(['D:/book_front.md', 'book2/*.md'], 'docx', outputfile="somefile.docx")
```


In addition to `format`, it is possible to pass `extra_args`.
That makes it possible to access various pandoc options easily.

Expand Down
80 changes: 68 additions & 12 deletions pypandoc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import sys
import tempfile
import textwrap
from turtle import TurtleScreenBase
import glob

from .handler import _check_log_handler
from .pandoc_download import DEFAULT_TARGET_FOLDER, download_pandoc
Expand Down Expand Up @@ -68,12 +68,12 @@ def convert_text(source:str, to:str, format:str, extra_args:Iterable=(), encodin
cworkdir=cworkdir)


def convert_file(source_file:str, to:str, format:Union[str, None]=None, extra_args:Iterable=(), encoding:str='utf-8',
def convert_file(source_file:Union[list, str], to:str, format:Union[str, None]=None, extra_args:Iterable=(), encoding:str='utf-8',
outputfile:Union[None, str]=None, filters:Union[Iterable, None]=None, verify_format:bool=True,
sandbox:bool=True, cworkdir:Union[str, None]=None) -> str:
"""Converts given `source` from `format` to `to`.

:param str source_file: file path (see encoding)
:param (str, list) source_file: Either a full file path, a relative file path, a file pattern (like dir/*.md), or a list of files or file patterns.

:param str to: format into which the input should be converted; can be one of
`pypandoc.get_pandoc_formats()[1]`
Expand Down Expand Up @@ -107,14 +107,39 @@ def convert_file(source_file:str, to:str, format:Union[str, None]=None, extra_ar
"""
if not _identify_path(source_file):
raise RuntimeError("source_file is not a valid path")
format = _identify_format_from_path(source_file, format)
return _convert_input(source_file, format, 'path', to, extra_args=extra_args,
if _is_network_path(source_file): # if the source_file is an url
format = _identify_format_from_path(source_file, format)
return _convert_input(source_file, format, 'path', to, extra_args=extra_args,
outputfile=outputfile, filters=filters,
verify_format=verify_format, sandbox=sandbox,
cworkdir=cworkdir)

discovered_source_files = []
if isinstance(source_file, str):
discovered_source_files += glob.glob(source_file)
if isinstance(source_file, list): # a list of possibly file or file patterns. Expand all with glob
for filepath in source_file:
discovered_source_files.extend(glob.glob(filepath))
if len(discovered_source_files) == 1: # behavior for a single file or a pattern
format = _identify_format_from_path(discovered_source_files[0], format)
return _convert_input(discovered_source_files[0], format, 'path', to, extra_args=extra_args,
outputfile=outputfile, filters=filters,
verify_format=verify_format, sandbox=sandbox,
cworkdir=cworkdir)
else: # behavior for multiple files or file patterns
format = _identify_format_from_path(discovered_source_files[0], format)
return _convert_input(discovered_source_files, format, 'path', to, extra_args=extra_args,
outputfile=outputfile, filters=filters,
verify_format=verify_format, sandbox=sandbox,
cworkdir=cworkdir)

def _identify_path(source:str) -> bool:

def _identify_path(source) -> bool:
if isinstance(source, list):
for single_source in source:
if not _identify_path(single_source):
return False
return True
is_path = False
try:
is_path = os.path.exists(source)
Expand All @@ -124,6 +149,15 @@ def _identify_path(source:str) -> bool:
# still false
pass

if not is_path:
try:
is_path = len(glob.glob(source)) >= 1
except UnicodeEncodeError:
is_path = len(glob.glob(source.encode('utf-8'))) >= 1
except: # noqa
# still false
pass

if not is_path:
try:
# check if it's an URL
Expand All @@ -140,6 +174,21 @@ def _identify_path(source:str) -> bool:

return is_path

def _is_network_path(source):
try:
# check if it's an URL
result = urlparse(source)
if result.scheme in ["http", "https"]:
return True
elif result.scheme and result.netloc and result.path:
# complete uri including one with a network path
return True
elif result.scheme == "file" and result.path:
return os.path.exists(url2path(source))
except AttributeError:
pass
return False


def _identify_format_from_path(sourcefile:str, format:str) -> str:
return format or os.path.splitext(sourcefile)[1].strip('.')
Expand Down Expand Up @@ -242,7 +291,13 @@ def _convert_input(source, format, input_type, to, extra_args=(),
to = normalize_format(to)

string_input = input_type == 'string'
input_file = [source] if not string_input else []
if not string_input:
if isinstance(source, str):
input_file = [source]
else:
input_file = source
else:
input_file = []
args = [__pandoc_path, '--from=' + format]

args.append('--to=' + to)
Expand Down Expand Up @@ -294,11 +349,12 @@ def _convert_input(source, format, input_type, to, extra_args=(),
p.stderr.read())
)

try:
source = cast_bytes(source, encoding='utf-8')
except (UnicodeDecodeError, UnicodeEncodeError):
# assume that it is already a utf-8 encoded string
pass
if string_input:
try:
source = cast_bytes(source, encoding='utf-8')
except (UnicodeDecodeError, UnicodeEncodeError):
# assume that it is already a utf-8 encoded string
pass
try:
stdout, stderr = p.communicate(source if string_input else None)
except OSError:
Expand Down
19 changes: 19 additions & 0 deletions tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,25 @@ def test_basic_conversion_from_file(self):
received = pypandoc.convert_file(file_name, 'rst')
self.assertEqualExceptForNewlineEnd(expected, received)

def test_basic_conversion_from_multiple_files(self):
    # Two markdown inputs with the same one-line text are passed as a list in
    # a single convert_file call; the expected HTML has one paragraph per input.
    with closed_tempfile('.md', text='some title') as first_md, \
            closed_tempfile('.md', text='some title') as second_md:
        html = pypandoc.convert_file([first_md, second_md], 'html')
        wanted = '<p>some title</p>\n<p>some title</p>'
        self.assertEqualExceptForNewlineEnd(wanted, html)

def test_basic_conversion_from_file_pattern(self):
    # A single glob pattern is accepted as source_file. The pattern is
    # relative to the current working directory; the phrases checked below
    # are expected to come from markdown files found there
    # (presumably the repository root -- confirm against the test runner).
    received = pypandoc.convert_file("./*.md", 'html')
    received = received.lower()
    # assertIn rather than a bare assert: it is not stripped under -O and it
    # reports the searched text when the check fails.
    self.assertIn("making a release", received)
    self.assertIn("pypandoc provides a thin wrapper", received)

def test_basic_conversion_from_file_pattern_with_input_list(self):
    # A list of glob patterns is accepted as source_file. The same pattern is
    # given twice, so every matched file is handed to the conversion twice.
    # The pattern is relative to the current working directory
    # (presumably the repository root -- confirm against the test runner).
    received = pypandoc.convert_file(["./*.md", "./*.md"], 'html')
    received = received.lower()
    # assertIn rather than a bare assert: it is not stripped under -O and it
    # reports the searched text when the check fails.
    self.assertIn("making a release", received)
    self.assertIn("pypandoc provides a thin wrapper", received)

@unittest.skipIf(sys.platform.startswith("win"), "File based urls do not work on windows: "
"https://github.com/jgm/pandoc/issues/4613")
def test_basic_conversion_from_file_url(self):
Expand Down