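"""BookStack exporter.

Exports documents from a BookStack instance through its HTTP API at the
chosen levels (pages/chapters/books), optionally together with attachments
and gallery images.
"""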
import argparse
import json
import logging
import os
from datetime import datetime
from logging import info, error, debug
from pathlib import Path
import re
import sys
from typing import Dict, List, Union
from urllib.request import urlopen, Request
from urllib.error import URLError, HTTPError
import urllib.parse
import base64
from time import time
from time import sleep
# (formatName, fileExtension)
FORMATS: Dict[str, str] = {
'markdown': 'md',
'plaintext': 'txt',
'pdf': 'pdf',
'html': 'html'
}
LEVELS = ['pages', 'chapters', 'books']
LOG_LEVEL: Dict[str, int] = {
'debug': logging.DEBUG,
'info': logging.INFO,
'warning': logging.WARNING,
'error': logging.ERROR
}
# Characters in filenames to be replaced with "_"
FORBIDDEN_CHARS: List[str] = ["/", "#"]
parser = argparse.ArgumentParser(description='BookStack exporter')
parser.add_argument('-p',
'--path',
type=str,
default='.',
help='Path where exported files will be placed.')
parser.add_argument(
'-t',
'--token-file',
type=str,
default=f'.{os.path.sep}token.txt',
help='File containing authorization token in format TOKEN_ID:TOKEN_SECRET')
parser.add_argument(
'-H',
'--host',
type=str,
default='https://localhost',
help='Your domain with protocol prefix, example: https://example.com')
parser.add_argument('-f',
'--formats',
type=str,
default=['markdown'],
nargs="+",
help='Space separated list of formats to use for export.',
choices=FORMATS.keys())
parser.add_argument('--rate-limit',
type=int,
default=180,
                    help='How many API requests can be made per minute. '
                    'Default is 180 (the BookStack default).')
parser.add_argument('-c',
'--forbidden-chars',
type=str,
default=FORBIDDEN_CHARS,
nargs="+",
help='Space separated list of symbols to be replaced '
'with "_" in filenames.')
parser.add_argument('-u',
'--user-agent',
type=str,
default="BookStack exporter",
                    help='User-Agent header content. If requests are '
                    'blocked because of a bad client/unrecognized web '
                    'browser/etc. (as can happen with Cloudflare '
                    'tunnels), change this to a typical web browser '
                    'user agent header.')
parser.add_argument('--additional-headers',
type=str,
nargs="+",
default=[],
                    help='List of arbitrary additional HTTP headers to be '
                    'sent with every HTTP request. They can override '
                    'default ones, including the Authorization header. '
                    'IMPORTANT: these headers are also sent when '
                    'downloading external attachments! Don\'t put any '
                    'private data here. Example: --additional-headers '
                    '"Header1: value1" "Header2: value2"')
parser.add_argument(
'-l',
'--level',
type=str,
default=['pages'],
nargs="+",
help="Space separated list of levels at which should be export "
"performed. ",
choices=LEVELS)
parser.add_argument(
'--force-update-files',
action='store_true',
default=False,
help="Set this option to skip checking local files timestamps against "
"remote last edit timestamps. This will cause overwriting local files,"
" even if they seem to be already in newest version.")
parser.add_argument(
'--images',
action='store_true',
default=False,
help="Download images and place them in dedicated directory in export path"
" root, preserving their internal paths")
parser.add_argument(
'--markdown-images',
action='store_true',
default=False,
help="The same as --images, but will also update image links in "
"exported markdown files (if they are bein exported)."
" Warning: this is experimental, as API does not provide a way to "
"know what images are actually on the page. Therefore for markdown data"
" all ']({URL}' occurences will be replaced with local, relative "
"path to images, and additionally any '/scaled-\\d+-/' regex match"
" will be replaced with '/' so that scaled images are also displayed")
parser.add_argument('--images-dir',
type=str,
default="exported-images",
                    help='When exporting images, they will be placed in a '
                    'dedicated directory under the export path root. '
                    'This parameter defines the name of that directory.')
parser.add_argument('--skip-broken-image-links',
default=False,
action='store_true',
help="Don't fail and skip downloading images if their "
"url obtained from images gallery API seem broken "
"(image cannot be downloaded OR fails to download).")
parser.add_argument('--dont-export-attachments',
default=False,
action='store_true',
help="Set this to prevent exporting any attachments.")
parser.add_argument(
'--dont-export-external-attachments',
action='store_true',
default=False,
help="Set this to prevent exporting external attachments (from links).")
parser.add_argument('-V',
'--log-level',
type=str,
default='info',
help='Set verbosity level.',
choices=LOG_LEVEL.keys())
args = parser.parse_args()
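# Example invocation (hypothetical host and token file path):
#   python exporter.py -H https://wiki.example.com -t ./token.txt \
#       -f markdown pdf -l pages books --images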
def removesuffix(text, suffix):
"""Remove suffix from text if matched."""
if text.endswith(suffix):
return text[:len(text) - len(suffix)]
return text
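# Note: equivalent to str.removesuffix(), available natively since
# Python 3.9; kept as a helper, presumably for older interpreters.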
logging.basicConfig(format='%(levelname)s :: %(message)s',
level=LOG_LEVEL.get(args.log_level))
formats: List[str] = args.formats
FORBIDDEN_CHARS = args.forbidden_chars
for frmt in formats:
if frmt not in FORMATS:
error("Unknown format name (NOT file extension), "
"check api docs for current version of your BookStack")
sys.exit(1)
# Hosts are URLs, so strip a trailing '/' (not os.path.sep, which is '\' on Windows)
API_PREFIX: str = f"{removesuffix(args.host, '/')}/api"
FS_PATH: str = removesuffix(args.path, os.path.sep)
LEVEL_CHOICE: List[str] = args.level
for lvl in LEVEL_CHOICE:
if lvl not in LEVELS:
error(f"Level {lvl} is not supported, can be only one of {LEVELS}")
sys.exit(1)
with open(args.token_file, 'r', encoding='utf-8') as f:
TOKEN: str = removesuffix(f.readline(), '\n')
HEADERS = {
'Content-Type': 'application/json; charset=utf-8',
'Authorization': f"Token {TOKEN}",
'User-Agent': args.user_agent
}
HEADERS_NO_TOKEN = {
'Content-Type': 'application/json; charset=utf-8',
'User-Agent': args.user_agent
}
for header in args.additional_headers:
values = header.split(':', 1)
if len(values) < 2:
raise ValueError(f"Improper HTTP header specification: {header}")
HEADERS[values[0]] = values[1]
HEADERS_NO_TOKEN[values[0]] = values[1]
SKIP_TIMESTAMPS: bool = args.force_update_files
SKIP_BROKEN_IMAGE_LINKS: bool = args.skip_broken_image_links
class ApiRateLimiter:
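    """Throttle API requests using a sliding 60-second window of timestamps."""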
def __init__(self, rate_limit: int) -> None:
self.__rate_limit = rate_limit
info(f"API rate limit: {self.__rate_limit}/min")
self.__requests_times: List[float] = []
def limit_rate_request(self):
"""Count another request and wait minimal required time if limit is reached."""
current_time = time()
self.__requests_times.append(current_time)
# filter out requests older than 60s ago
self.__requests_times = list(
filter(lambda x: current_time - x <= 60, self.__requests_times))
# sleep until oldest remembered request is more than 60s ago
if len(self.__requests_times) > self.__rate_limit:
wait_time = self.__requests_times[0] + 60 - current_time
info(f"API Rate limit reached, waiting {round(wait_time, 2)}s")
sleep(wait_time)
api_rate_limiter = ApiRateLimiter(args.rate_limit)
class Node:
"""Clas representing any node in whole bookstack documents "tree"."""
def __init__(self, name: str, parent: Union['Node', None], node_id: int,
last_edit_timestamp: datetime):
for char in FORBIDDEN_CHARS:
name = name.replace(char, "_")
self.__name: str = name
self.__children: List['Node'] = []
self.__parent: Union['Node', None] = parent
if parent is not None:
parent.add_child(self)
self.__last_edit_timestamp: datetime = last_edit_timestamp
self.__node_id = node_id
@property
def name(self) -> str:
"""Return name of this Shelf/Book/Chapter/Page."""
return self.__name
@property
def parent(self) -> Union['Node', None]:
"""Return parent Node or None if there isn't any."""
return self.__parent
def changed_since(self, timestamp: datetime) -> int:
"""
        Check if the remote version has changed after the given timestamp,
        including its children.
        :param timestamp:
        :return: number of changed documents at the level of this document Node
"""
result: int = 0
if self.__last_edit_timestamp > timestamp:
result += 1
for child in self.__children:
result += child.changed_since(timestamp)
return result
def get_last_edit_timestamp(self) -> datetime:
return self.__last_edit_timestamp
def set_parent(self, parent: 'Node'):
self.__parent = parent
parent.add_child(self)
def add_child(self, child: 'Node'):
self.__children.append(child)
def get_all_ids(self) -> List[int]:
"""Return list containing id of this node, and all child nodes."""
ids = [self.get_id()]
for child in self.__children:
child_ids = child.get_all_ids()
for id in child_ids:
ids.append(id)
return ids
def get_path(self) -> str:
if self.__parent is None:
return "."
return self.__parent.get_path() + os.path.sep + self.__parent.name
def get_id(self) -> int:
return self.__node_id
def parents_levels(self) -> int:
"""Calculate nesting level of this Node."""
if self.__parent is not None:
return 1 + self.__parent.parents_levels()
return 0
class AttachedFile(Node):
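    """Node for a file downloaded directly from a URL (used for gallery images)."""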
def __init__(self, name: str, parent_id: int, url: str, path: str,
node_id: int, last_edit_timestamp: datetime):
"""
name: filename
parent_id: uploaded_to value from api
url: http url directly for file download
path: path value of the object from api (filepath)
node_id: id of the object
last_edit_timestamp: timestamp from updated_at api field
"""
super().__init__(name, None, node_id, last_edit_timestamp)
self.__parent_id = parent_id
self.__url = url
self.__path = path
def get_parent_id(self) -> int:
return self.__parent_id
def get_url(self) -> str:
return self.__url
def get_path(self) -> str:
"""Path value of the object from api."""
return self.__path
shelves: Dict[int, Node] = {}
books: Dict[int, Node] = {}
chapters: Dict[int, Node] = {}
pages: Dict[int, Node] = {}
pages_not_in_chapter: Dict[int, Node] = {}
attachments: Dict[int, Node] = {}
images: Dict[int, AttachedFile] = {}
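# Node hierarchy: Shelf -> Book -> Chapter -> Page; attachments and gallery
# images are tied to the page they were uploaded to ('uploaded_to').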
def api_timestamp_string_to_datetime(timestamp: str) -> datetime:
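    """Parse an API 'updated_at' timestamp, e.g. '2024-01-31T12:34:56.000000Z'."""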
return datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S.%fZ')
def make_dir(path: str):
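    """Create the directory (and missing parents) unless it already exists."""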
path_obj = Path(path)
if path_obj.exists():
return
info(f"Creating dir {path}")
path_obj.mkdir(exist_ok=True, parents=True)
def api_get_bytes(path: str, raw_url: bool = False, **kwargs) -> bytes:
"""
    Retrieve bytes from a specific relative api path.
    If raw_url is set to True, the path is accessed directly, without
    prefixing it with the base api url.
"""
request_path: str = f'{API_PREFIX}/{path}'
if raw_url:
request_path = path
if len(kwargs) > 0:
params: str = urllib.parse.urlencode(kwargs)
request_path += f"?{params}"
debug(f"Making http request: {request_path}")
request: Request = Request(request_path, headers=HEADERS)
api_rate_limiter.limit_rate_request()
    try:
        with urlopen(request) as response:
            return response.read()
    except HTTPError as err:
        # urlopen() raises HTTPError for error statuses, so the code has
        # to be inspected in the handler, not on the response object.
        if err.code == 403:
            error("403 Forbidden, check your token!")
            sys.exit(err.code)
        raise
def api_get_dict(path: str) -> dict:
"""Make api request at specified path and return result as dict."""
data = api_get_bytes(path).decode()
return json.loads(data)
def api_get_listing(path: str) -> list:
"""Retrieve whole lists through api.
Request for another 50 until have collected "total" amount.
:param path:
:return:
"""
count: int = 50
total: int = count
result: list = []
while total > len(result):
data: dict = json.loads(
api_get_bytes(path, count=count, offset=len(result)))
total = data['total']
result += data['data']
debug(f"API listing got {len(result)} items out of maximum {count}")
return result
def image_translate_path(img_path: str) -> str:
"""Update remote path attribute string to be local image path.
img_path: image 'path' attribute from api
"""
return f"{FS_PATH}{os.path.sep}{args.images_dir}{img_path}"
def check_if_update_needed(file_path: str, document: Node) -> bool:
"""Check if a Node need updating on disk, according to timestamps."""
if SKIP_TIMESTAMPS:
return True
debug(f"Checking for update for file {file_path}")
if not os.path.exists(file_path):
debug(f"Document {file_path} is missing on disk, update needed.")
return True
local_last_edit: datetime = datetime.fromtimestamp(
os.path.getmtime(file_path))
remote_last_edit: datetime = document.get_last_edit_timestamp()
debug("Local file creation timestamp: "
f"{local_last_edit.date()} {local_last_edit.time()}, "
"remote edit timestamp: "
f"{remote_last_edit.date()} {remote_last_edit.time()}")
changes: int = document.changed_since(local_last_edit)
if changes > 0:
info(f"Document \"{file_path}\" consists of {changes} "
"outdated documents, update needed.")
return True
debug(f"Document \"{file_path}\" consists of {changes} "
"outdated documents, skipping updating.")
return False
def update_markdown_image_tags(doc: Node, data: bytes) -> bytes:
"""Update all image tags to point to exported images in given markdown data."""
levels = doc.parents_levels()
# "](" is a part of markdown image tag, used here to
# try preventing replacing host url in other paces
dir_fallback = ']('
dir_fallback += '../' * levels
host = removesuffix(args.host, '/')
dir_fallback += args.images_dir
data = data.replace(f']({host}'.encode(), dir_fallback.encode())
data_str = re.sub(r'/scaled-\d+-/', r'/', data.decode())
return data_str.encode()
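# Example (hypothetical host/image, page nested two levels deep):
#   '](https://example.com/uploads/images/gallery/scaled-1680-/img.png)'
#   becomes '](../../exported-images/uploads/images/gallery/img.png)'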
def export_doc(documents: List[Node], level: str):
"""Save document-like Nodes to files."""
for document in documents:
make_dir(f"{FS_PATH}{os.path.sep}{document.get_path()}")
for v_format in formats:
path: str = f"{FS_PATH}{os.path.sep}{document.get_path()}" + \
f"{os.path.sep}{document.name}.{FORMATS[v_format]}"
if not check_if_update_needed(path, document):
continue
data: bytes = api_get_bytes(
f'{level}/{document.get_id()}/export/{v_format}')
if args.markdown_images and v_format == 'markdown':
data = update_markdown_image_tags(document, data)
with open(path, 'wb') as file:
info(f"Saving {path}")
file.write(data)
def export_attachments(attachments: List[Node]):
"""Save attachment Nodes to files."""
for attachment in attachments:
base_path = attachment.get_path()
if attachment.parent is None:
base_path = f'__ATTACHMENTS_FROM_DELETED_PAGES__{os.path.sep}{base_path}'
make_dir(f"{FS_PATH}{os.path.sep}{base_path}")
path: str = f"{FS_PATH}{os.path.sep}{base_path}" + \
f"{os.path.sep}{attachment.name}"
if not check_if_update_needed(path, attachment):
continue
data = api_get_bytes(f'attachments/{attachment.get_id()}')
data = json.loads(data)
content = data['content']
content_url = urllib.parse.urlparse(content)
if content_url.scheme:
if args.dont_export_external_attachments:
continue
info(f"Downloading attachment from url: {content_url.geturl()}")
request: Request = Request(content_url.geturl(),
headers=HEADERS_NO_TOKEN)
with urlopen(request) as response:
                if response.status >= 300:
                    error(
                        "Could not download link-type attachment from "
                        f"'{content_url.geturl()}', got code {response.status}!"
                    )
sys.exit(response.status)
with open(path, 'wb') as file:
info(f"Saving {path}")
file.write(response.read())
else:
with open(path, 'wb') as file:
info(f"Saving {path}")
file.write(base64.b64decode(content))
def export_images():
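    """Download gallery images into the local images directory."""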
for img in images.values():
path = image_translate_path(img.get_path())
img_dir = os.path.dirname(path)
make_dir(img_dir)
if not check_if_update_needed(path, img):
continue
try:
data: bytes = api_get_bytes(img.get_url(), raw_url=True)
except (URLError, HTTPError) as exc:
error(f"Failed downloading image '{img.get_url()}': {exc}")
if not SKIP_BROKEN_IMAGE_LINKS:
sys.exit(1)
else:
continue
with open(path, 'wb') as file:
info(f"Saving {path}")
file.write(data)
#########################
# Gathering data from api
#########################
info("Getting info about Shelves and their Books")
for shelf_data in api_get_listing('shelves'):
last_edit_ts: datetime = api_timestamp_string_to_datetime(
shelf_data['updated_at'])
shelf = Node(shelf_data.get('name'), None, shelf_data.get('id'),
last_edit_ts)
debug(f"Shelf: \"{shelf.name}\", ID: {shelf.get_id()}")
shelves[shelf.get_id()] = shelf
shelf_details = api_get_dict(f'shelves/{shelf.get_id()}')
if shelf_details.get('books') is None:
continue
for book_data in shelf_details['books']:
last_edit_ts: datetime = api_timestamp_string_to_datetime(
book_data['updated_at'])
book = Node(book_data.get('name'), shelf, book_data.get('id'),
last_edit_ts)
debug(f"Book: \"{book.name}\", ID: {book.get_id()}")
books[book.get_id()] = book
info("Getting info about Books not belonging to any shelf")
for book_data in api_get_listing('books'):
if book_data.get('id') in books:
continue
last_edit_ts: datetime = api_timestamp_string_to_datetime(
book_data['updated_at'])
book = Node(book_data.get('name'), None, book_data.get('id'), last_edit_ts)
debug(f"Book: \"{book.name}\", ID: {book.get_id()}, "
f"last edit: {book.get_last_edit_timestamp()}")
info(f"Book \"{book.name} has no shelf assigned.\"")
books[book.get_id()] = book
info("Getting info about Chapters")
for chapter_data in api_get_listing('chapters'):
last_edit_ts: datetime = api_timestamp_string_to_datetime(
chapter_data['updated_at'])
chapter = Node(chapter_data.get('name'),
books.get(chapter_data.get('book_id')),
chapter_data.get('id'), last_edit_ts)
debug(f"Chapter: \"{chapter.name}\", ID: {chapter.get_id()},"
f" last edit: {chapter.get_last_edit_timestamp()}")
chapters[chapter.get_id()] = chapter
info("Getting info about Pages")
for page_data in api_get_listing('pages'):
parent_id = page_data.get('chapter_id')
last_edit_ts: datetime = api_timestamp_string_to_datetime(
page_data['updated_at'])
if parent_id not in chapters:
parent = books[page_data['book_id']]
page = Node(page_data.get('name'), parent, page_data.get('id'),
last_edit_ts)
info(f"Page \"{page.name}\" is not in any chapter, "
f"using Book \"{parent.name}\" as a parent.")
debug(f"Page: \"{page.name}\", ID: {page.get_id()},"
f" last edit: {page.get_last_edit_timestamp()}")
pages[page.get_id()] = page
pages_not_in_chapter[page.get_id()] = page
continue
page = Node(page_data.get('name'), chapters.get(parent_id),
page_data.get('id'), last_edit_ts)
debug(f"Page: \"{page.name}\", ID: {page.get_id()}, "
f"last edit: {page.get_last_edit_timestamp()}")
pages[page.get_id()] = page
if not args.dont_export_attachments:
info("Getting info about Attachments.")
for attachment_data in api_get_listing('attachments'):
last_edit_ts: datetime = api_timestamp_string_to_datetime(
attachment_data['updated_at'])
all_pages = {}
all_pages.update(pages)
all_pages.update(pages_not_in_chapter)
attachment = Node(attachment_data.get('name'),
all_pages.get(attachment_data.get('uploaded_to')),
attachment_data.get('id'), last_edit_ts)
debug(f"Attachment: \"{attachment.name}\", ID: {attachment.get_id()},"
f" last edit: {attachment.get_last_edit_timestamp()}")
attachments[attachment.get_id()] = attachment
if args.images or args.markdown_images:
info("Getting info about Image gallery.")
for image_data in api_get_listing('image-gallery'):
last_edit_ts: datetime = api_timestamp_string_to_datetime(
image_data['updated_at'])
image = AttachedFile(name=image_data.get('name'),
parent_id=image_data.get('uploaded_to'),
url=image_data.get('url'),
path=image_data.get('path'),
node_id=image_data.get('id'),
last_edit_timestamp=last_edit_ts)
debug(f"Image: \"{image.name}\", ID: {image.get_id()},"
f" last edit: {image.get_last_edit_timestamp()}")
images[image.get_id()] = image
#########################
# Exporting data from api
#########################
files: List[Node] = []
EXPORT_PAGES_NOT_IN_CHAPTER: bool = False
for lvl in LEVEL_CHOICE:
if lvl == 'pages':
files = list(pages.values())
elif lvl == 'chapters':
files = list(chapters.values())
EXPORT_PAGES_NOT_IN_CHAPTER = True
elif lvl == 'books':
files = list(books.values())
export_doc(files, lvl)
if EXPORT_PAGES_NOT_IN_CHAPTER:
info("Exporting pages that are not in chapter...")
export_doc(list(pages_not_in_chapter.values()), 'pages')
if not args.dont_export_attachments:
export_attachments(list(attachments.values()))
if args.images or args.markdown_images:
export_images()
info("Finished")
sys.exit(0)