Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Simplify collection of raw text #2087

Merged
merged 1 commit into from
Nov 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 11 additions & 29 deletions novelwriter/formats/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,9 +90,10 @@ def __init__(self, project: NWProject) -> None:
self._project = project

# Data Variables
self._text = "" # The raw text to be tokenized
self._handle = None # The item handle currently being processed
self._keepRaw = False # Whether to keep the raw text, used by ToRaw
self._text = "" # The raw text to be tokenized
self._handle = None # The item handle currently being processed
self._keepRaw = False # Whether to keep the raw text, used by ToRaw
self._noTokens = False # Disable tokenization if they're not needed

# Blocks and Meta Data (Per Document)
self._blocks: list[T_Block] = []
Expand Down Expand Up @@ -522,22 +523,24 @@ def tokenizeText(self) -> None:
4: The internal formatting map of the text, TxtFmt.*
5: The formats of the block, BlockFmt.*
"""
if self._keepRaw:
self._raw.append(f"{self._text.rstrip()}\n\n")
if self._noTokens:
return
if self._isNovel:
self._hFormatter.setHandle(self._handle)

# Cache Flags
isNovel = self._isNovel
keepRaw = self._keepRaw
doJustify = self._doJustify
keepBreaks = self._keepBreaks
indentFirst = self._indentFirst
firstIndent = self._firstIndent

if self._isNovel:
self._hFormatter.setHandle(self._handle)

# Replace all instances of [br] with a placeholder character
text = REGEX_PATTERNS.lineBreak.sub("\uffff", self._text)

nHead = 0
rawText = []
tHandle = self._handle or ""
tBlocks: list[T_Block] = [B_EMPTY]
for bLine in text.splitlines():
Expand All @@ -547,8 +550,6 @@ def tokenizeText(self) -> None:
# Check for blank lines
if not sLine:
tBlocks.append(B_EMPTY)
if keepRaw:
rawText.append("\n")
continue

if self._breakNext:
Expand Down Expand Up @@ -613,14 +614,10 @@ def tokenizeText(self) -> None:
tBlocks.append((
BlockTyp.COMMENT, "", tLine, tFmt, tStyle
))
if keepRaw:
rawText.append(f"{aLine}\n")

elif cStyle == nwComment.FOOTNOTE:
tLine, tFmt = self._extractFormats(cText, skip=TextFmt.FNOTE)
self._footnotes[f"{tHandle}:{cKey}"] = (tLine, tFmt)
if keepRaw:
rawText.append(f"{aLine}\n")

elif aLine.startswith("@"):
# Keywords
Expand All @@ -634,8 +631,6 @@ def tokenizeText(self) -> None:
tBlocks.append((
BlockTyp.KEYWORD, tTag[1:], tLine, tFmt, tStyle
))
if keepRaw:
rawText.append(f"{aLine}\n")

elif aLine.startswith(("# ", "#! ")):
# Title or Partition Headings
Expand Down Expand Up @@ -670,8 +665,6 @@ def tokenizeText(self) -> None:
tBlocks.append((
tType, f"{tHandle}:T{nHead:04d}", tText, [], tStyle
))
if keepRaw:
rawText.append(f"{aLine}\n")

elif aLine.startswith(("## ", "##! ")):
# (Unnumbered) Chapter Headings
Expand Down Expand Up @@ -704,8 +697,6 @@ def tokenizeText(self) -> None:
tBlocks.append((
tType, f"{tHandle}:T{nHead:04d}", tText, [], tStyle
))
if keepRaw:
rawText.append(f"{aLine}\n")

elif aLine.startswith(("### ", "###! ")):
# (Alternative) Scene Headings
Expand Down Expand Up @@ -744,8 +735,6 @@ def tokenizeText(self) -> None:
tBlocks.append((
tType, f"{tHandle}:T{nHead:04d}", tText, [], tStyle
))
if keepRaw:
rawText.append(f"{aLine}\n")

elif aLine.startswith("#### "):
# Section Headings
Expand Down Expand Up @@ -773,8 +762,6 @@ def tokenizeText(self) -> None:
tBlocks.append((
tType, f"{tHandle}:T{nHead:04d}", tText, [], tStyle
))
if keepRaw:
rawText.append(f"{aLine}\n")

else:
# Text Lines
Expand Down Expand Up @@ -821,8 +808,6 @@ def tokenizeText(self) -> None:
tBlocks.append((
BlockTyp.TEXT, "", tLine, tFmt, tStyle
))
if keepRaw:
rawText.append(f"{aLine}\n")

# If we have content, turn off the first page flag
if self._isFirst and tBlocks:
Expand All @@ -840,9 +825,6 @@ def tokenizeText(self) -> None:

# Always add an empty line at the end of the file
tBlocks.append(B_EMPTY)
if keepRaw:
rawText.append("\n")
self._raw.append("".join(rawText))

# Second Pass
# ===========
Expand Down
1 change: 1 addition & 0 deletions novelwriter/formats/toraw.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ class ToRaw(Tokenizer):
def __init__(self, project: NWProject) -> None:
super().__init__(project)
self._keepRaw = True
self._noTokens = True
return

def doConvert(self) -> None:
Expand Down
5 changes: 3 additions & 2 deletions tests/reference/mBuildDocBuild_NWD_Lorem_Ipsum.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
"meta": {
"projectName": "Lorem Ipsum",
"novelAuthor": "lipsum.com",
"buildTime": 1730136328,
"buildTimeStr": "2024-10-28 18:25:28"
"buildTime": 1731001720,
"buildTimeStr": "2024-11-07 18:48:40"
},
"text": {
"nwd": [
Expand All @@ -21,6 +21,7 @@
">> \u201cThere is no one who loves pain itself, who seeks after it and wants to have it, simply because it is pain\u2026\u201d <<"
],
[
"[NEW PAGE]",
"",
"% Exctracted from the lipsum.com website.",
"",
Expand Down
1 change: 1 addition & 0 deletions tests/reference/mBuildDocBuild_NWD_Lorem_Ipsum.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

>> “There is no one who loves pain itself, who seeks after it and wants to have it, simply because it is pain…” <<

[NEW PAGE]

% Exctracted from the lipsum.com website.

Expand Down
12 changes: 6 additions & 6 deletions tests/test_core/test_core_docbuild.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,8 +467,8 @@ def testCoreDocBuild_Custom(mockGUI, fncPath: Path):
assert docFile.read_text(encoding="utf-8") == (
"#! New Novel\n\n"
">> By Jane Doe <<\n\n"
"## New Chapter\n\n\n"
"### New Scene\n\n\n"
"## New Chapter\n\n"
"### New Scene\n\n"
)
docFile.unlink()

Expand Down Expand Up @@ -497,8 +497,8 @@ def testCoreDocBuild_Custom(mockGUI, fncPath: Path):
assert docFile.read_text(encoding="utf-8") == (
"#! New Novel\n\n"
">> By Jane Doe <<\n\n"
"## New Chapter\n\n\n"
"### New Scene\n\n\n"
"## New Chapter\n\n"
"### New Scene\n\n"
)
docFile.unlink()

Expand Down Expand Up @@ -621,8 +621,8 @@ def testCoreDocBuild_IterBuild(mockGUI, fncPath: Path, mockRnd):
assert docFile.read_text(encoding="utf-8") == (
"#! New Novel\n\n"
">> By Jane Doe <<\n\n"
"## New Chapter\n\n\n"
"### New Scene\n\n\n"
"## New Chapter\n\n"
"### New Scene\n\n"
"#! Notes: Plot\n\n"
"# Main Plot\n"
"**Text**\n\n"
Expand Down
Loading