Skip to content

Commit

Permalink
Load bad words when loading good words in SQ
Browse files Browse the repository at this point in the history
When adding `good_words.txt` to the project dictionary in Spell
Query, also load `bad_words.txt` if it exists.
Always report bad words as bad spellings, even if they are in the
dictionary or good words file.
Mark bad words with asterisks in SQ report
  • Loading branch information
windymilla committed Sep 24, 2023
1 parent 7cbb8c9 commit 39f421c
Show file tree
Hide file tree
Showing 4 changed files with 161 additions and 111 deletions.
1 change: 1 addition & 0 deletions src/guiguts.pl
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@
our %charsuiteenabled = ( 'Basic Latin' => 1 ); # All projects allow Basic Latin character suite
our %pagenumbers;
our %projectdict;
our %projectbadwords;
our %reghints = ();
our %scannoslist;
our %geometryhash; #Geometry of some windows in one hash.
Expand Down
115 changes: 74 additions & 41 deletions src/lib/Guiguts/ErrorCheck.pm
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ sub errorcheckpop_up {
spellquerycleardict(); # Clear cache, so project dictionary will be reloaded below
errorcheckpop_up( $textwindow, $top, $errorchecktype ); # Rerun Spell Query
},
-text => 'Add good_words.txt',
-text => 'Add good/bad words',
-width => 16
)->grid( -row => 1, -column => $gcol - 1 );
my $btnskip = $ptopframeb->Button(
Expand Down Expand Up @@ -564,7 +564,11 @@ sub errorcheckpop_up {
} elsif ( $errorchecktype eq "Spell Query" ) { # Spell query also has a frequency count to append
my $freq = spellqueryfrequency($line);
next if $freq > $::spellquerythreshold; # If it's spelled the same way several times, it's probably not an error
$line .= " ($freq)";
if ( $line =~ s/\*\*\*$// ) { # Spelling flagged as bad word
$line .= " *$freq*";
} else {
$line .= " ($freq)"; # Ordinary misspelling
}

} elsif ( $errorchecktype eq "EPUBCheck" ) {

Expand Down Expand Up @@ -1480,6 +1484,11 @@ sub booklouperun {
my %sqglobaldict = ();
my %sqbadwordfreq = ();

# Codes returned by spellquerywordok()
my $SQWORDOKYES = 0; # Word in dictionary, or meets other criteria for being OK
my $SQWORDOKNO = 1; # Typically a spelling error
my $SQWORDOKBAD = 2; # Actual "bad word" - maybe an OK word, but bad in this project - needs marking specially

#
# Run Spell Query on whole file
sub spellqueryrun {
Expand Down Expand Up @@ -1511,21 +1520,27 @@ sub booklouperun {
$nextcol = $col + length($wd) + 1; # Step forward word length plus single space

next unless $wd; # Empty word if two consecutive separators, e.g. period & space
next if spellquerywordok($wd);
my $wordok = spellquerywordok($wd);
next if $wordok == $SQWORDOKYES;

# If word has leading straight apostrophe, it might be open single quote; trim it and check again
if ( $wd =~ s/^'// ) {
++$col; # Allow for having removed apostrophe from start of word
next if spellquerywordok($wd);
$wordok = spellquerywordok($wd);
next if $wordok == $SQWORDOKYES;
}

# if trailing straight/curly apostrophe, it might be close single quote; trim it and check again
next if $wd =~ s/['$APOS]$//g and spellquerywordok($wd);
next if $wd =~ /^['$APOS]*$/; # OK if nothing left in string but zero or more quotes
if ( $wd =~ s/['$APOS]$//g ) {
$wordok = spellquerywordok($wd);
next if $wordok == $SQWORDOKYES;
}
next if $wd =~ /^['$APOS]*$/; # OK if nothing left in string but zero or more quotes

# Format message - increment word frequency; final total gets appended later when populating dialog
$sqbadwordfreq{$wd}++;
my $error = sprintf( "%d:%-2d - %s", $step, $col, $wd );
my $badwd = $wordok == $SQWORDOKBAD ? "***" : ""; # Flag bad word to higher routine with asterisks
my $error = sprintf( "%d:%-2d - %s%s", $step, $col, $wd, $badwd );
utf8::encode($error);
print $logfile "$error\n";
}
Expand All @@ -1541,77 +1556,94 @@ sub booklouperun {
my $wfword = shift;

for my $wd ( split( /\W/, $wfword ) ) {
return 0 unless spellquerywordok($wd); # part not found
return 0 unless spellquerywordok($wd) == $SQWORDOKYES; # part not found
}
return 1; # all parts ok
return 1; # all parts ok
}

#
# Return true if word is OK, e.g. in dictionary or meets some other criterion
sub spellquerywordok {
my $wd = shift;

# First check if word is in dictionary
return 1 if spellqueryindictapos($wd);
# Return status code if whole thing is a bad word or in the dictionary
my $wordok = spellqueryindictapos($wd);
return $wordok
if $wordok == $SQWORDOKBAD
or $wordok == $SQWORDOKYES;

# Some languages use l', quest', etc., before word - accept if the "prefix" and the main word are both good
# Prefix can be with or without apostrophe ("with" is safer to avoid prefix being accepted if standalone word)
return 1
return $SQWORDOKYES
if $wd =~ /^(\w+)['$APOS](\w+)/
and ( spellqueryindictapos($1) or spellqueryindictapos( $1 . "'" ) )
and spellqueryindictapos($2);
and (spellqueryindictapos($1) == $SQWORDOKYES
or spellqueryindictapos( $1 . "'" ) == $SQWORDOKYES )
and spellqueryindictapos($2) == $SQWORDOKYES;

# Now check numbers
return 1 if $wd =~ /^\d+$/; # word is all digits
return 1 if $wd =~ /^(\d*[02-9])?1st$/i; # ...1st, ...21st, ...31st, etc
return 1 if $wd =~ /^(\d*[02-9])?2n?d$/i; # ...2nd, ...22nd, ...32nd, etc (also 2d, 22d, etc)
return 1 if $wd =~ /^(\d*[02-9])?3r?d$/i; # ...3rd, ...23rd, ...33rd, etc (also 3d, 33d, etc)
return 1 if $wd =~ /^\d*[04-9]th$/i; # ...0th, ...4th, ...5th, etc
return 1 if $wd =~ /^\d*1[123]th$/i; # ...11th, ...12th, ...13th
return $SQWORDOKYES if $wd =~ /^\d+$/; # word is all digits
return $SQWORDOKYES if $wd =~ /^(\d*[02-9])?1st$/i; # ...1st, ...21st, ...31st, etc
return $SQWORDOKYES if $wd =~ /^(\d*[02-9])?2n?d$/i; # ...2nd, ...22nd, ...32nd, etc (also 2d, 22d, etc)
return $SQWORDOKYES if $wd =~ /^(\d*[02-9])?3r?d$/i; # ...3rd, ...23rd, ...33rd, etc (also 3d, 33d, etc)
return $SQWORDOKYES if $wd =~ /^\d*[04-9]th$/i; # ...0th, ...4th, ...5th, etc
return $SQWORDOKYES if $wd =~ /^\d*1[123]th$/i; # ...11th, ...12th, ...13th

# Allow decades/years
return 1 if $wd =~ /^['$APOS]?\d\ds$/; # e.g. '20s or 20s (abbreviation for 1820s)
return 1 if $wd =~ /^['$APOS]\d\d$/; # e.g. '62 (abbreviation for 1862)
return 1 if $wd =~ /^1\d{3}s$/; # e.g. 1820s
return $SQWORDOKYES if $wd =~ /^['$APOS]?\d\ds$/; # e.g. '20s or 20s (abbreviation for 1820s)
return $SQWORDOKYES if $wd =~ /^['$APOS]\d\d$/; # e.g. '62 (abbreviation for 1862)
return $SQWORDOKYES if $wd =~ /^1\d{3}s$/; # e.g. 1820s

# Allow abbreviations for shillings and pence (not pounds because 20l is common scanno for the number 201)
return 1 if $wd =~ /^\d{1,2}[sd]$/; # e.g. 15s or 6d (up to 2 digits of old English shillings and pence)
return $SQWORDOKYES if $wd =~ /^\d{1,2}[sd]$/; # e.g. 15s or 6d (up to 2 digits of old English shillings and pence)

return 1 if $wd =~ /^sc$/i; # <sc> DP markup
return $SQWORDOKYES if $wd =~ /^sc$/i; # <sc> DP markup

return 0;
return $SQWORDOKNO;
}

#
# Return true if a word is in the dictionary, allowing swap of straight/curly apostrophes
sub spellqueryindictapos {
my $wd = shift;

# First check if word is in dictionary
return 1 if spellqueryindict($wd);
my $wordok = spellqueryindict($wd);

# Return status code if it's a bad word, in the dictionary or doesn't contain apostrophes
return $wordok
if $wordok == $SQWORDOKBAD
or $wordok == $SQWORDOKYES
or $wd !~ /['$APOS]/;

# Now try swapping straight/curly apostrophes and recheck
# Contains apostrophes - try swapping straight/curly and recheck
if ( $wd =~ /$APOS/ ) {
$wd =~ s/$APOS/'/g;
return 1 if spellqueryindict($wd);
} elsif ( $wd =~ /'/ ) {
$wd =~ s/'/$APOS/g;
return 1 if spellqueryindict($wd);
}
return 0;
return spellqueryindict($wd);
}

#
# Return true if word is in dictionary: same case, lower case, or title case (e.g. LONDON matches London)
# Return whether word is in the bad_words list, dictionary, or neither (wrong spelling)
# Check same case, lower case, or title case (e.g. LONDON matches London)
# Can't just do case-insensitive check because we don't want "london" to be OK.
sub spellqueryindict {
my $wd = shift;
return 1 if $sqglobaldict{$wd};
return 1 if $sqglobaldict{ lc $wd };
return 1
if length($wd) > 1 and $sqglobaldict{ substr( $wd, 0, 1 ) . lc substr( $wd, 1 ) };
my $wd = shift;
my $lcwd = lc $wd;
my $tcwd = length($wd) > 1 ? substr( $wd, 0, 1 ) . substr( $lcwd, 1 ) : '';
$tcwd = '' if $tcwd eq $lcwd; # No point in checking same word twice

return 0;
# First ensure bad words are always reported
return $SQWORDOKBAD if exists( $::projectbadwords{$wd} );
return $SQWORDOKBAD if exists( $::projectbadwords{$lcwd} );
return $SQWORDOKBAD if $tcwd and exists( $::projectbadwords{$tcwd} );

# Now check dictionary for good words
return $SQWORDOKYES if $sqglobaldict{$wd};
return $SQWORDOKYES if $sqglobaldict{$lcwd};
return $SQWORDOKYES if $tcwd and $sqglobaldict{$tcwd};

return $SQWORDOKNO;
}

#
Expand Down Expand Up @@ -1642,7 +1674,8 @@ sub booklouperun {
}

# Now add project dictionary words
delete $::projectdict{$_} for keys %::projectdict; # Old spellcheck code doesn't clear hash in spellloadprojectdict()
delete $::projectdict{$_} for keys %::projectdict; # Old spellcheck code doesn't clear hash in spellloadprojectdict()
delete $::projectbadwords{$_} for keys %::projectbadwords;
::spellloadprojectdict();
$sqglobaldict{$_} = 1 for keys %::projectdict;

Expand Down Expand Up @@ -1695,7 +1728,7 @@ sub booklouperun {
# Return how many times bad word referred to in error message was found during the check
sub spellqueryfrequency {
my $line = shift;
return $sqbadwordfreq{$1} if $line =~ /^\d+:\d+ +- (.+)/;
return $sqbadwordfreq{$1} if $line =~ /^\d+:\d+ +- ([^*]+)/; # Ignore asterisks marking bad words
return 0;
}

Expand Down
66 changes: 24 additions & 42 deletions src/lib/Guiguts/MultiLingual.pm
Original file line number Diff line number Diff line change
Expand Up @@ -680,60 +680,42 @@ sub includeprojectdict {
#
# Add all spelt foreign words to project dictionary
sub addspeltforeignproject {
::spellloadprojectdict();
my $i = 0;
for my $key ( sort ( keys %distinctwords ) ) {
if ( ( $seenwordslang{$key} )
&& ( $seenwordslang{$key} ne $::multidicts[0] )
&& ( $seenwordslang{$key} ne 'user' ) ) {
$::projectdict{$key} = $seenwordslang{$key};
$i++;
}
}

my $section = "\%projectdict = (\n";
for my $key ( sort keys %::projectdict ) {
$key =~ s/'/\\'/g;
$section .= "'$key' => '',\n";
}
$section .= ");";
utf8::encode($section);
open my $save, '>:bytes', $::lglobal{projectdictname};
print $save $section;
close $save;
my $line = "$i words added to project dictionary";
$multiwclistbox->delete( '0', 'end' );
$multiwclistbox->insert( 'end', $line );
$multiwclistbox->update;
updategloballists();
getwordcounts();
addcriterionproject('speltforeign');
}

#
# Add words occuring >= minfreq times to project dictionary
sub addminfreqproject {
addcriterionproject('minfreq');
}

#
# Add all words that meet criterion to project dictionary
# Input argument should be 'speltforeign' or 'minfreq'
sub addcriterionproject {
my $criterion = shift;

::spellloadprojectdict();
my $i = 0;
for my $key ( keys %distinctwords ) {
unless ( $seenwordslang{$key} ) {
if ( $distinctwords{$key} >= $minfreq ) {
$::projectdict{$key} = 'freq';
$seenwordslang{$key} = 'freq';
if ( $criterion eq 'speltforeign' ) {
if ( ( $seenwordslang{$key} )
&& ( $seenwordslang{$key} ne $::multidicts[0] )
&& ( $seenwordslang{$key} ne 'user' ) ) {
$::projectdict{$key} = $seenwordslang{$key};
$i++;
}
} elsif ( $criterion eq 'minfreq' ) {
unless ( $seenwordslang{$key} ) {
if ( $distinctwords{$key} >= $minfreq ) {
$::projectdict{$key} = 'freq';
$seenwordslang{$key} = 'freq';
$i++;
}
}
}
}

my $section = "\%projectdict = (\n";
for my $key ( sort keys %::projectdict ) {
$key =~ s/'/\\'/g;
$section .= "'$key' => '',\n";
}
$section .= ");";
utf8::encode($section);
open my $save, '>:bytes', $::lglobal{projectdictname};
print $save $section;
close $save;
::spellsaveprojdict();
my $line = "$i words added to project dictionary";
$multiwclistbox->delete( '0', 'end' );
$multiwclistbox->insert( 'end', $line );
Expand Down
Loading

0 comments on commit 39f421c

Please sign in to comment.