Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat/error_check: add path arguments check and simplify path process. #15

Merged
merged 8 commits into from
Mar 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .gitpod.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@ tasks:
vscode:
extensions:
- ms-python.python

- cweijan.vscode-office

- ms-python.black-formatter
- eamodio.gitlens
15 changes: 9 additions & 6 deletions cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,12 @@
logger.remove()
logger.add(sys.stderr, level="DEBUG")

if args.input and args.output:
edit_pdf(args.input, args.output, args.ignore)
elif args.indir and args.outdir:
batch_edit_pdf(args.indir, args.outdir, args.ignore)
else:
get_parser().print_help()
try:
if args.input and args.output:
edit_pdf(args.input, args.output, args.ignore)
elif args.indir and args.outdir:
batch_edit_pdf(args.indir, args.outdir, args.ignore)
else:
get_parser().print_help()
except:
exit(1)
5 changes: 2 additions & 3 deletions pdf_white_cut/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def extract_item_box(item):
elif isinstance(item, LTCurve):
logger.debug("use itself: {}", item)
elif isinstance(item, LTTextBox):
logger.warning("NotImplemented and use itself: {}", item)
logger.warning("use itself since NotImplemented: {}", item)
elif isinstance(item, LTTextLine):
# there are 2 types of `LTTextLine`: horizontal and vertical
text = item.get_text().encode("unicode_escape")
Expand Down Expand Up @@ -91,7 +91,7 @@ def extract_item_box(item):
bbox[3] + item.height / 2,
)
elif isinstance(item, LTImage):
logger.warning("NotImplemented and use itself: {}", item)
logger.warning("use itself since NotImplemented: {}", item)
elif isinstance(item, LTFigure):
logger.debug("analyse LTFigure:{}", item)
# for `LTFigure`, the bbox is modified in `PDFMiner`
Expand Down Expand Up @@ -160,7 +160,6 @@ def extract_pdf_boxs(filename, ignore=0):
boxs.append(box)

max_box = get_max_box(boxs)
logger.warning("visible bbox: {}", max_box)
page_boxs.append(max_box)

logger.warning("max visible bbox for the page: {}", max_box)
Expand Down
40 changes: 21 additions & 19 deletions pdf_white_cut/cutter.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,21 @@ def edit_page_box(page, visible_box):
logger.info("cut media box to: {}", box)


def edit_pdf(source: str, target: str, ignore=0):
def edit_pdf(source: Path, target: Path, ignore=0):
"""
Edit the input pdf file to cut its white margins, and output a new pdf file.
"""
# guard type
source = Path(source)
target = Path(target)

if source == target:
logger.error("{} {}", source, target)
raise Exception("input and output can not be the same!")

if not source.exists():
raise Exception("input file not exists! ({})".format(source))

try:
# NOTE: never move or reorder the following steps, because of file IO.
# analyses the visible box of each page, aka the box scale. res=[(x1,y1,x2,y2)]
Expand All @@ -66,39 +73,34 @@ def edit_pdf(source: str, target: str, ignore=0):
edit_page_box(page, box)
outpdf.add_page(page)

Path(target).dirname().makedirs_p()
logger.info("output to {}", Path(target))
target.abspath().dirname().makedirs_p()
with open(target, "wb") as outfd:
outpdf.write(outfd)
logger.info("output file: {}", target)

except UnicodeEncodeError as ue:
logger.exception("UnicodeEncodeError while processing file:{}", source)
logger.exception(ue)
raise ue
except Exception as e:
logger.exception("Some other Error while processing file:{}", source)
logger.exception("Some unknown Error while processing file:{}", source)
logger.exception(e)
raise e


def scan_files(folder, glob=""):
"""
scan files under the dir with spec prefix and postfix
"""
files = []
for item in Path(folder).listdir(glob):
item: "Path"
files.append(item.basename())
return files

def batch_edit_pdf(indir: Path, outdir: Path, ignore=0):
# guard type
indir = Path(indir)
outdir = Path(outdir)

def batch_edit_pdf(indir, outdir, ignore=0):
if indir == outdir:
raise Exception("input and output can not be the same!")

files = scan_files(indir, glob="*.pdf")
logger.info(files)
files = [pdf.basename() for pdf in indir.listdir("*.pdf")]
logger.info("pdf files in spec folder: {}", files)

if not os.path.exists(indir):
os.mkdir(indir)
# guard dir
outdir.makedirs_p()

logger.info("input dir: {}", indir)
logger.info("output dir: {}", outdir)
Expand Down
4 changes: 2 additions & 2 deletions pdf_white_cut/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,15 @@ def get_parser():
"-id",
help="input directory",
action="store",
default="",
default=".",
type=str,
dest="indir",
)
parser.add_argument(
"-od",
help="output directory",
action="store",
default="",
default=".",
type=str,
dest="outdir",
)
Expand Down
4 changes: 4 additions & 0 deletions tests/test_cli.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
python cli.py -i cases/input/input.pdf -o output/output.pdf > /dev/null 2>&1
echo $? "== 0"
python cli.py -i cases/input/non_exist.pdf -o output/output.pdf > /dev/null 2>&1
echo $? "== 1"