Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Load bad words when loading good words in SQ #1254

Merged
merged 2 commits into from
Sep 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/guiguts.pl
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@
our %charsuiteenabled = ( 'Basic Latin' => 1 ); # All projects allow Basic Latin character suite
our %pagenumbers;
our %projectdict;
our %projectbadwords;
our %reghints = ();
our %scannoslist;
our %geometryhash; #Geometry of some windows in one hash.
Expand Down
148 changes: 103 additions & 45 deletions src/lib/Guiguts/ErrorCheck.pm
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,13 @@ sub errorcheckpop_up {
-width => 16
)->grid( -padx => 10, -row => 0, -column => $gcol++ );

# Spell Query bad word count sits under the Run Checks button
if ( $errorchecktype eq 'Spell Query' ) {
$gcol--;
$::lglobal{sqbadwordcountlbl} =
$ptopframeb->Label()->grid( -padx => 0, -row => 1, -column => $gcol++ );
}

$ptopframeb->Button(
-command => sub {
errorcheckcopy();
Expand Down Expand Up @@ -120,7 +127,7 @@ sub errorcheckpop_up {
spellquerycleardict(); # Clear cache, so project dictionary will be reloaded below
errorcheckpop_up( $textwindow, $top, $errorchecktype ); # Rerun Spell Query
},
-text => 'Add good_words.txt',
-text => 'Add good/bad words',
-width => 16
)->grid( -row => 1, -column => $gcol - 1 );
my $btnskip = $ptopframeb->Button(
Expand Down Expand Up @@ -564,7 +571,11 @@ sub errorcheckpop_up {
} elsif ( $errorchecktype eq "Spell Query" ) { # Spell query also has a frequency count to append
my $freq = spellqueryfrequency($line);
next if $freq > $::spellquerythreshold; # If it's spelled the same way several times, it's probably not an error
$line .= " ($freq)";
if ( $line =~ s/\*\*\*$// ) { # Spelling flagged as bad word
$line .= " *$freq*";
} else {
$line .= " ($freq)"; # Ordinary misspelling
}

} elsif ( $errorchecktype eq "EPUBCheck" ) {

Expand Down Expand Up @@ -645,6 +656,12 @@ sub errorcheckpop_up {
} else {
errsortrefresh($errorchecktype);
eccountupdate($countqueries);
if ( $errorchecktype eq "Spell Query" ) {
my $numbadwords = sqgetnumbadwords();
my $numbadwordslbl = $numbadwords > 0 ? "*$numbadwords* bad word" : "";
$numbadwordslbl .= "s" if $numbadwords > 1;
$::lglobal{sqbadwordcountlbl}->configure( -text => $numbadwordslbl );
}
}

$::lglobal{errorchecklistbox}->update;
Expand Down Expand Up @@ -1477,8 +1494,14 @@ sub booklouperun {
#
# Block to make Spell Query dictionary hash local & persistent
{
my %sqglobaldict = ();
my %sqbadwordfreq = ();
my %sqglobaldict = ();
my %sqbadspellingfreq = ();
my %sqbadwordcount = ();

# Codes returned by spellquerywordok()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Love the use of these named "constants" -- very easy to understand what's going on 👍

my $SQWORDOKYES = 0; # Word in dictionary, or meets other criteria for being OK
my $SQWORDOKNO = 1; # Typically a spelling error
my $SQWORDOKBAD = 2; # Actual "bad word" - maybe an OK word, but bad in this project - needs marking specially

#
# Run Spell Query on whole file
Expand Down Expand Up @@ -1511,21 +1534,28 @@ sub booklouperun {
$nextcol = $col + length($wd) + 1; # Step forward word length plus single space

next unless $wd; # Empty word if two consecutive separators, e.g. period & space
next if spellquerywordok($wd);
my $wordok = spellquerywordok($wd);
next if $wordok == $SQWORDOKYES;

# If word has leading straight apostrophe, it might be open single quote; trim it and check again
if ( $wd =~ s/^'// ) {
++$col; # Allow for having removed apostrophe from start of word
next if spellquerywordok($wd);
$wordok = spellquerywordok($wd);
next if $wordok == $SQWORDOKYES;
}

# if trailing straight/curly apostrophe, it might be close single quote; trim it and check again
next if $wd =~ s/['$APOS]$//g and spellquerywordok($wd);
next if $wd =~ /^['$APOS]*$/; # OK if nothing left in string but zero or more quotes
if ( $wd =~ s/['$APOS]$//g ) {
$wordok = spellquerywordok($wd);
next if $wordok == $SQWORDOKYES;
}
next if $wd =~ /^['$APOS]*$/; # OK if nothing left in string but zero or more quotes

# Format message - increment word frequency; final total gets appended later when populating dialog
$sqbadwordfreq{$wd}++;
my $error = sprintf( "%d:%-2d - %s", $step, $col, $wd );
$sqbadspellingfreq{$wd}++;
my $badwd = $wordok == $SQWORDOKBAD ? "***" : ""; # Flag bad word to higher routine with asterisks
$sqbadwordcount{$wd}++ if $badwd;
my $error = sprintf( "%d:%-2d - %s%s", $step, $col, $wd, $badwd );
utf8::encode($error);
print $logfile "$error\n";
}
Expand All @@ -1541,77 +1571,94 @@ sub booklouperun {
my $wfword = shift;

for my $wd ( split( /\W/, $wfword ) ) {
return 0 unless spellquerywordok($wd); # part not found
return 0 unless spellquerywordok($wd) == $SQWORDOKYES; # part not found
}
return 1; # all parts ok
return 1; # all parts ok
}

#
# Return true if word is OK, e.g. in dictionary or meets some other criterion
sub spellquerywordok {
my $wd = shift;

# First check if word is in dictionary
return 1 if spellqueryindictapos($wd);
# Return status code if whole thing is a bad word or in the dictionary
my $wordok = spellqueryindictapos($wd);
return $wordok
if $wordok == $SQWORDOKBAD
or $wordok == $SQWORDOKYES;

# Some languages use l', quest', etc., before word - accept if the "prefix" and the main word are both good
# Prefix can be with or without apostrophe ("with" is safer to avoid prefix being accepted if standalone word)
return 1
return $SQWORDOKYES
if $wd =~ /^(\w+)['$APOS](\w+)/
and ( spellqueryindictapos($1) or spellqueryindictapos( $1 . "'" ) )
and spellqueryindictapos($2);
and (spellqueryindictapos($1) == $SQWORDOKYES
or spellqueryindictapos( $1 . "'" ) == $SQWORDOKYES )
and spellqueryindictapos($2) == $SQWORDOKYES;

# Now check numbers
return 1 if $wd =~ /^\d+$/; # word is all digits
return 1 if $wd =~ /^(\d*[02-9])?1st$/i; # ...1st, ...21st, ...31st, etc
return 1 if $wd =~ /^(\d*[02-9])?2n?d$/i; # ...2nd, ...22nd, ...32nd, etc (also 2d, 22d, etc)
return 1 if $wd =~ /^(\d*[02-9])?3r?d$/i; # ...3rd, ...23rd, ...33rd, etc (also 3d, 33d, etc)
return 1 if $wd =~ /^\d*[04-9]th$/i; # ...0th, ...4th, ...5th, etc
return 1 if $wd =~ /^\d*1[123]th$/i; # ...11th, ...12th, ...13th
return $SQWORDOKYES if $wd =~ /^\d+$/; # word is all digits
return $SQWORDOKYES if $wd =~ /^(\d*[02-9])?1st$/i; # ...1st, ...21st, ...31st, etc
return $SQWORDOKYES if $wd =~ /^(\d*[02-9])?2n?d$/i; # ...2nd, ...22nd, ...32nd, etc (also 2d, 22d, etc)
return $SQWORDOKYES if $wd =~ /^(\d*[02-9])?3r?d$/i; # ...3rd, ...23rd, ...33rd, etc (also 3d, 33d, etc)
return $SQWORDOKYES if $wd =~ /^\d*[04-9]th$/i; # ...0th, ...4th, ...5th, etc
return $SQWORDOKYES if $wd =~ /^\d*1[123]th$/i; # ...11th, ...12th, ...13th

# Allow decades/years
return 1 if $wd =~ /^['$APOS]?\d\ds$/; # e.g. '20s or 20s (abbreviation for 1820s)
return 1 if $wd =~ /^['$APOS]\d\d$/; # e.g. '62 (abbreviation for 1862)
return 1 if $wd =~ /^1\d{3}s$/; # e.g. 1820s
return $SQWORDOKYES if $wd =~ /^['$APOS]?\d\ds$/; # e.g. '20s or 20s (abbreviation for 1820s)
return $SQWORDOKYES if $wd =~ /^['$APOS]\d\d$/; # e.g. '62 (abbreviation for 1862)
return $SQWORDOKYES if $wd =~ /^1\d{3}s$/; # e.g. 1820s

# Allow abbreviations for shillings and pence (not pounds because 20l is common scanno for the number 201)
return 1 if $wd =~ /^\d{1,2}[sd]$/; # e.g. 15s or 6d (up to 2 digits of old English shillings and pence)
return $SQWORDOKYES if $wd =~ /^\d{1,2}[sd]$/; # e.g. 15s or 6d (up to 2 digits of old English shillings and pence)

return 1 if $wd =~ /^sc$/i; # <sc> DP markup
return $SQWORDOKYES if $wd =~ /^sc$/i; # <sc> DP markup

return 0;
return $SQWORDOKNO;
}

#
# Return true if a word is in the dictionary, allowing swap of straight/curly apostrophes
sub spellqueryindictapos {
my $wd = shift;

# First check if word is in dictionary
return 1 if spellqueryindict($wd);
my $wordok = spellqueryindict($wd);

# Now try swapping straight/curly apostrophes and recheck
# Return status code if it's a bad word, in the dictionary or doesn't contain apostrophes
return $wordok
if $wordok == $SQWORDOKBAD
or $wordok == $SQWORDOKYES
or $wd !~ /['$APOS]/;

# Contains apostrophes - try swapping straight/curly and recheck
if ( $wd =~ /$APOS/ ) {
$wd =~ s/$APOS/'/g;
return 1 if spellqueryindict($wd);
} elsif ( $wd =~ /'/ ) {
$wd =~ s/'/$APOS/g;
return 1 if spellqueryindict($wd);
}
return 0;
return spellqueryindict($wd);
}

#
# Return true if word is in dictionary: same case, lower case, or title case (e.g. LONDON matches London)
# Return whether word is in the bad_words list, dictionary, or neither (wrong spelling)
# Check same case, lower case, or title case (e.g. LONDON matches London)
# Can't just do case-insensitive check because we don't want "london" to be OK.
sub spellqueryindict {
my $wd = shift;
return 1 if $sqglobaldict{$wd};
return 1 if $sqglobaldict{ lc $wd };
return 1
if length($wd) > 1 and $sqglobaldict{ substr( $wd, 0, 1 ) . lc substr( $wd, 1 ) };
my $wd = shift;
my $lcwd = lc $wd;
my $tcwd = length($wd) > 1 ? substr( $wd, 0, 1 ) . substr( $lcwd, 1 ) : '';
$tcwd = '' if $tcwd eq $lcwd; # No point in checking same word twice

return 0;
# First ensure bad words are always reported
return $SQWORDOKBAD if exists( $::projectbadwords{$wd} );
return $SQWORDOKBAD if exists( $::projectbadwords{$lcwd} );
return $SQWORDOKBAD if $tcwd and exists( $::projectbadwords{$tcwd} );

# Now check dictionary for good words
return $SQWORDOKYES if $sqglobaldict{$wd};
return $SQWORDOKYES if $sqglobaldict{$lcwd};
return $SQWORDOKYES if $tcwd and $sqglobaldict{$tcwd};

return $SQWORDOKNO;
}

#
Expand Down Expand Up @@ -1642,7 +1689,8 @@ sub booklouperun {
}

# Now add project dictionary words
delete $::projectdict{$_} for keys %::projectdict; # Old spellcheck code doesn't clear hash in spellloadprojectdict()
delete $::projectdict{$_} for keys %::projectdict; # Old spellcheck code doesn't clear hash in spellloadprojectdict()
delete $::projectbadwords{$_} for keys %::projectbadwords;
::spellloadprojectdict();
$sqglobaldict{$_} = 1 for keys %::projectdict;

Expand Down Expand Up @@ -1695,7 +1743,7 @@ sub booklouperun {
# Return how many times bad word referred to in error message was found during the check
sub spellqueryfrequency {
my $line = shift;
return $sqbadwordfreq{$1} if $line =~ /^\d+:\d+ +- (.+)/;
return $sqbadspellingfreq{$1} if $line =~ /^\d+:\d+ +- ([^*]+)/; # Ignore asterisks marking bad words
return 0;
}

Expand All @@ -1708,9 +1756,19 @@ sub booklouperun {
#
# Clear the spell query frequency counts
sub spellqueryclearcounts {
delete $sqbadwordfreq{$_} for keys %sqbadwordfreq;
delete $sqbadspellingfreq{$_} for keys %sqbadspellingfreq;
delete $sqbadwordcount{$_} for keys %sqbadwordcount;
}

#
# Get number of bad words used in file
sub sqgetnumbadwords {
my $numbadwords = 0;
for my $key ( keys %sqbadwordcount ) {
$numbadwords += $sqbadwordcount{$key};
}
return $numbadwords;
}
} # end of variable-enclosing block

#
Expand Down
66 changes: 24 additions & 42 deletions src/lib/Guiguts/MultiLingual.pm
Original file line number Diff line number Diff line change
Expand Up @@ -680,60 +680,42 @@ sub includeprojectdict {
#
# Add all spelt foreign words to project dictionary
sub addspeltforeignproject {
::spellloadprojectdict();
my $i = 0;
for my $key ( sort ( keys %distinctwords ) ) {
if ( ( $seenwordslang{$key} )
&& ( $seenwordslang{$key} ne $::multidicts[0] )
&& ( $seenwordslang{$key} ne 'user' ) ) {
$::projectdict{$key} = $seenwordslang{$key};
$i++;
}
}

my $section = "\%projectdict = (\n";
for my $key ( sort keys %::projectdict ) {
$key =~ s/'/\\'/g;
$section .= "'$key' => '',\n";
}
$section .= ");";
utf8::encode($section);
open my $save, '>:bytes', $::lglobal{projectdictname};
print $save $section;
close $save;
my $line = "$i words added to project dictionary";
$multiwclistbox->delete( '0', 'end' );
$multiwclistbox->insert( 'end', $line );
$multiwclistbox->update;
updategloballists();
getwordcounts();
addcriterionproject('speltforeign');
}

#
# Add words occuring >= minfreq times to project dictionary
sub addminfreqproject {
addcriterionproject('minfreq');
}

#
# Add all words that meet criterion to project dictionary
# Input argument should be 'speltforeign' or 'minfreq'
sub addcriterionproject {
my $criterion = shift;

::spellloadprojectdict();
my $i = 0;
for my $key ( keys %distinctwords ) {
unless ( $seenwordslang{$key} ) {
if ( $distinctwords{$key} >= $minfreq ) {
$::projectdict{$key} = 'freq';
$seenwordslang{$key} = 'freq';
if ( $criterion eq 'speltforeign' ) {
if ( ( $seenwordslang{$key} )
&& ( $seenwordslang{$key} ne $::multidicts[0] )
&& ( $seenwordslang{$key} ne 'user' ) ) {
$::projectdict{$key} = $seenwordslang{$key};
$i++;
}
} elsif ( $criterion eq 'minfreq' ) {
unless ( $seenwordslang{$key} ) {
if ( $distinctwords{$key} >= $minfreq ) {
$::projectdict{$key} = 'freq';
$seenwordslang{$key} = 'freq';
$i++;
}
}
}
}

my $section = "\%projectdict = (\n";
for my $key ( sort keys %::projectdict ) {
$key =~ s/'/\\'/g;
$section .= "'$key' => '',\n";
}
$section .= ");";
utf8::encode($section);
open my $save, '>:bytes', $::lglobal{projectdictname};
print $save $section;
close $save;
::spellsaveprojdict();
my $line = "$i words added to project dictionary";
$multiwclistbox->delete( '0', 'end' );
$multiwclistbox->insert( 'end', $line );
Expand Down
Loading
Loading