Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
kevin2li committed Jul 13, 2023
1 parent ccec922 commit dcb9629
Show file tree
Hide file tree
Showing 7 changed files with 57 additions and 20 deletions.
6 changes: 5 additions & 1 deletion app.go
Original file line number Diff line number Diff line change
Expand Up @@ -822,13 +822,17 @@ func (a *App) WriteBookmarkByFile(inFile string, outFile string, tocFile string,
return nil
}

func (a *App) WriteBookmarkByGap(inFile string, outFile string, gap int, format string) error {
func (a *App) WriteBookmarkByGap(inFile string, outFile string, gap int, format string, startNumber int, pages string) error {
logger.Printf("inFile: %s, outFile: %s, gap: %d\n", inFile, outFile, gap)
if _, err := os.Stat(inFile); os.IsNotExist(err) {
logger.Println(err)
return err
}
args := []string{"bookmark", "add", "--method", "gap"}
if pages != "" {
args = append(args, "--page_range", pages)
}
args = append(args, "--start-number", fmt.Sprintf("%d", startNumber))
args = append(args, "--gap", fmt.Sprintf("%d", gap))
if format != "" {
args = append(args, "--format", format)
Expand Down
10 changes: 9 additions & 1 deletion frontend/src/components/Forms/BookmarkForm.vue
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,16 @@
<a-form-item name="write_offset" label="页码偏移量" v-if="formState.write_type == 'file'">
<a-input-number v-model:value="formState.write_offset" />
</a-form-item>
<a-form-item name="page" label="页码范围" hasFeedback :validateStatus="validateStatus.page"
:help="validateHelp.page">
<a-input v-model:value="formState.page" placeholder="e.g. 3-N (留空表示全部页面)" allow-clear />
</a-form-item>
<a-form-item name="write_gap" label="间隔页数" v-if="formState.write_type == 'page'">
<a-input-number v-model:value="formState.write_gap" />
</a-form-item>
<a-form-item name="start_number" label="起始编号" v-if="formState.write_type == 'page'">
<a-input-number v-model:value="formState.start_number"/>
</a-form-item>
<a-form-item name="bookmark.write_format" label="命名格式" v-if="formState.write_type == 'page'">
<a-input v-model:value="formState.write_format" placeholder="e.g. 第%p页(%p表示页码)" allow-clear />
</a-form-item>
Expand Down Expand Up @@ -208,6 +215,7 @@ export default defineComponent({
default_level: 1,
remove_blank_lines: true,
recognize_type: "font",
start_number: 1,
});
const indentItems = reactive<{ items: IndentItem[] }>({
Expand Down Expand Up @@ -330,7 +338,7 @@ export default defineComponent({
break;
}
case "page": {
await handleOps(WriteBookmarkByGap, [formState.input, formState.output, formState.write_gap, formState.write_format]);
await handleOps(WriteBookmarkByGap, [formState.input, formState.output, formState.write_gap, formState.write_format, formState.start_number, formState.page]);
break;
}
}
Expand Down
1 change: 1 addition & 0 deletions frontend/src/components/data.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ interface BookmarkState {
default_level: number;
remove_blank_lines: boolean;
recognize_type: string;
start_number: number;
}

interface OcrState {
Expand Down
2 changes: 1 addition & 1 deletion frontend/wailsjs/go/main/App.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -126,4 +126,4 @@ export function WatermarkPDFByText(arg1:string,arg2:string,arg3:string,arg4:stri

export function WriteBookmarkByFile(arg1:string,arg2:string,arg3:string,arg4:number):Promise<void>;

export function WriteBookmarkByGap(arg1:string,arg2:string,arg3:number,arg4:string):Promise<void>;
export function WriteBookmarkByGap(arg1:string,arg2:string,arg3:number,arg4:string,arg5:number,arg6:string):Promise<void>;
4 changes: 2 additions & 2 deletions frontend/wailsjs/go/main/App.js
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,6 @@ export function WriteBookmarkByFile(arg1, arg2, arg3, arg4) {
return window['go']['main']['App']['WriteBookmarkByFile'](arg1, arg2, arg3, arg4);
}

export function WriteBookmarkByGap(arg1, arg2, arg3, arg4) {
return window['go']['main']['App']['WriteBookmarkByGap'](arg1, arg2, arg3, arg4);
export function WriteBookmarkByGap(arg1, arg2, arg3, arg4, arg5, arg6) {
return window['go']['main']['App']['WriteBookmarkByGap'](arg1, arg2, arg3, arg4, arg5, arg6);
}
31 changes: 29 additions & 2 deletions thirdparty/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
import fitz
import matplotlib.pyplot as plt
import numpy as np
from paddleocr import PaddleOCR, PPStructure, draw_ocr
from paddleocr import PaddleOCR, PPStructure, draw_ocr, save_structure_res
from paddleocr.ppstructure.recovery.recovery_to_doc import (
convert_info_docx, sorted_layout_boxes)
from PIL import Image
from tqdm import tqdm

Expand Down Expand Up @@ -423,6 +425,30 @@ def extract_item_from_pdf(doc_path: str, page_range: str = 'all', type: str = "f
logger.error(traceback.format_exc())
dump_json(cmd_output_path, {"status": "error", "message": traceback.format_exc()})

def convert_pdf2docx(doc_path: str, lang: str = "ch", dpi: int = 300, page_range: str = "all", output_path: str = None):
    """Convert selected pages of a PDF into .docx files using PP-Structure layout recovery.

    Every selected page is rasterized, analysed with ``PPStructure(recovery=True)``,
    its layout boxes are ordered with ``sorted_layout_boxes`` and the result is
    handed to ``convert_info_docx`` under the name ``<pdf stem>-<1-based page number>``.

    Args:
        doc_path: Path of the PDF file to convert.
        lang: Language code forwarded to ``PPStructure``.
        dpi: Rendering resolution; the page is scaled by ``dpi / 72``.
        page_range: Page selection string, interpreted by ``parse_range``.
        output_path: Directory that receives the generated files. When ``None``,
            a ``docx`` directory next to the PDF is used. The directory is
            created if it does not exist.

    Returns:
        None. Failures are not raised: the traceback is logged and written to
        ``cmd_output_path`` as an ``{"status": "error", ...}`` payload.
    """
    doc = None
    try:
        doc = fitz.open(doc_path)
        table_engine = PPStructure(recovery=True, lang=lang)
        roi_indicies = parse_range(page_range, doc.page_count)
        p = Path(doc_path)
        output_dir = p.parent / "docx" if output_path is None else Path(output_path)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Scale factor relative to the 72-dpi base used by the original dpi/72 matrix.
        zoom = dpi / 72
        for page_index in roi_indicies:
            page = doc[page_index]
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
            # View the raw samples as an (height, width, channels) uint8 image.
            img = np.frombuffer(buffer=pix.samples, dtype=np.uint8).reshape((pix.height, pix.width, -1))
            result = table_engine(img)
            # sorted_layout_boxes only needs the image width.
            res = sorted_layout_boxes(result, img.shape[1])
            convert_info_docx(img, res, output_dir, f"{p.stem}-{page_index+1}")
    except:
        logger.error(traceback.format_exc())
        dump_json(cmd_output_path, {"status": "error", "message": traceback.format_exc()})
    finally:
        # Always release the PDF handle, even when a page fails to convert;
        # otherwise the input file stays open (and locked on Windows).
        if doc is not None:
            doc.close()

def main():
parser = argparse.ArgumentParser()
sub_parsers = parser.add_subparsers()
Expand Down Expand Up @@ -470,4 +496,5 @@ def main():
extract_item_from_pdf(doc_path=args.input_path, page_range=args.page_range, type=args.type, output_dir=args.output)

if __name__ == "__main__":
    # Dispatch to the argparse-driven CLI. Ad-hoc calls with machine-specific
    # paths belong in a scratch script, not in the module entry point, because
    # they break every command-line invocation of this file.
    main()
23 changes: 10 additions & 13 deletions thirdparty/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -868,14 +868,17 @@ def add_toc_from_file(toc_path: str, doc_path: str, offset: int, output_path: st
dump_json(cmd_output_path, {"status": "error", "message": traceback.format_exc()})

@batch_process
def add_toc_by_gap(doc_path: str, gap: int = 1, format: str = "第%p页", output_path: str = None):
def add_toc_by_gap(doc_path: str, gap: int = 1, format: str = "第%p页", start_number: int = 1, page_range: str = "all", output_path: str = None):
try:
doc: fitz.Document = fitz.open(doc_path)
p = Path(doc_path)
toc = []
for i in range(0, doc.page_count, gap):
toc.append([1, format.replace("%p", str(i+1)), i+1])
toc.append([1, format.replace("%p", str(doc.page_count)), doc.page_count])
roi_indicies = parse_range(page_range, doc.page_count)
n = len(roi_indicies)
for i in range(0, n, gap):
toc.append([1, format.replace("%p", str(start_number)), roi_indicies[i]+1])
start_number += gap
# toc.append([1, format.replace("%p", str(doc.page_count)), doc.page_count])
doc.set_toc(toc)
if output_path is None:
output_path = str(p.parent / f"{p.stem}-[页码书签版].pdf")
Expand Down Expand Up @@ -2370,6 +2373,8 @@ def main():
### 页码书签
bookmark_add_parser.add_argument("--gap", type=int, default=1, help="页码间隔")
bookmark_add_parser.add_argument("--format", type=str, default="第%p页", help="页码格式")
bookmark_add_parser.add_argument("--page_range", type=str, default="all", help="页码范围")
bookmark_add_parser.add_argument("--start-number", type=int, default=1, help="起始编号")

## 提取书签
bookmark_extract_parser = bookmark_sub_parsers.add_parser("extract", help="提取书签")
Expand Down Expand Up @@ -2630,7 +2635,7 @@ def main():
if args.method == "file":
add_toc_from_file(toc_path=args.toc, doc_path=args.input_path, offset=args.offset, output_path=args.output)
elif args.method == "gap":
add_toc_by_gap(doc_path=args.input_path, gap=args.gap, format=args.format, output_path=args.output)
add_toc_by_gap(doc_path=args.input_path, gap=args.gap, format=args.format, start_number=args.start_number, page_range=args.page_range, output_path=args.output)
elif args.bookmark_which == "extract":
extract_toc(doc_path=args.input_path, format=args.format, output_path=args.output)
elif args.bookmark_which == "transform":
Expand Down Expand Up @@ -2726,11 +2731,3 @@ def main():

if __name__ == "__main__":
main()
# find_title_by_rect_annot(r"C:\Users\kevin\Desktop\书签测试\2023考研英语一真题-去水印版.pdf")
# find_title_by_rect_annot(r"C:\Users\kevin\Desktop\书签测试\迅捷PDF编辑器v2.0使用手册-去水印版.pdf", "3-N")
# find_title_by_rect_annot(r"C:\Users\kevin\Desktop\书签测试\2022-中国计算机学会推荐国际学术会议和期刊目录.pdf")
# find_title_by_rect_annot(r"C:\Users\kevin\Desktop\书签测试\Computer Networking_ A Top-Down Approach, Global Edition, 8th Edition.pdf", "33-N")
# find_title_by_rect_annot(r"C:\Users\kevin\Desktop\书签测试\项目任务书(最终签字版).pdf", "11-13")
# find_title_by_rect_annot(r"C:\Users\kevin\Desktop\书签测试\SQL必知必会(第5版).pdf", "18-N")
# find_title_by_rect_annot(r"C:\Users\kevin\Desktop\书签测试\汤书操作系统课本.pdf", "10-N")
# extract_encrypt_pdf_hash(r"C:\Users\kevin\Downloads\Detecting Ponzi Schemes on Ethereum Proceedings of the 2018 World Wide Web-加密.pdf")

0 comments on commit dcb9629

Please sign in to comment.