Skip to content

Commit

Permalink
refactor: FindMatches
Browse files Browse the repository at this point in the history
- use PrepareTMXEntry/TMXEntry to reduce the number of parameters of the private FindMatches#processEntry method

Signed-off-by: Hiroshi Miura <miurahr@linux.com>
  • Loading branch information
miurahr committed Nov 5, 2024
1 parent 1e3b3ce commit ff3040d
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 99 deletions.
4 changes: 2 additions & 2 deletions config/checkstyle/suppressions.xml
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,8 @@
<suppress files="Platform\.java" checks="ConstantName" lines="61-80"/>
<!-- util/Preferences -->
<suppress files="Preferences\.java" checks="LineLength" lines="197"/>
<!-- core/stat -->
<suppress checks="(ParameterNumber|MethodLength)" files="FindMatches\.java" lines="164,350,459"/>
<!-- core/statistics, ignore private FindMatches#addNearString -->
<suppress checks="ParameterNumber" files="FindMatches\.java" lines="410-440"/>
<!-- util/xml -->
<suppress checks="(EmptyBlock|MethodLength)" files="XMLStreamReader\.java"/>
<!-- util -->
Expand Down
180 changes: 83 additions & 97 deletions src/org/omegat/core/statistics/FindMatches.java
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,9 @@
import org.omegat.core.data.ExternalTMFactory;
import org.omegat.core.data.ExternalTMX;
import org.omegat.core.data.IProject;
import org.omegat.core.data.IProject.DefaultTranslationsIterator;
import org.omegat.core.data.IProject.MultipleTranslationsIterator;
import org.omegat.core.data.ITMXEntry;
import org.omegat.core.data.PrepareTMXEntry;
import org.omegat.core.data.SourceTextEntry;
import org.omegat.core.data.TMXEntry;
import org.omegat.core.events.IStopped;
import org.omegat.core.matching.FuzzyMatcher;
import org.omegat.core.matching.ISimilarityCalculator;
Expand Down Expand Up @@ -88,7 +86,8 @@ public class FindMatches {

/**
* According to gettext source code, PO fuzzy entries are created above 60%
* <a href="https://sourceforge.net/p/omegat/feature-requests/1258/">RFE#1258</a>
* <a href=
* "https://sourceforge.net/p/omegat/feature-requests/1258/">RFE#1258</a>
*/
static final int PENALTY_FOR_FUZZY = 40;
private static final int PENALTY_FOR_REMOVED = 5;
Expand Down Expand Up @@ -164,10 +163,8 @@ public FindMatches(IProject project, int maxCount, boolean allowSeparateSegmentM
public List<NearString> search(String searchText, boolean requiresTranslation, boolean fillSimilarityData,
IStopped stop) throws StoppedException {
result = new ArrayList<>(OConsts.MAX_NEAR_STRINGS + 1);

srcText = searchText;
removedText = "";

// remove part that is to be removed according to user settings.
// Rationale: it might be a big string influencing the 'editing
// distance', while it is not really part
Expand All @@ -181,56 +178,48 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
srcText = removeMatcher.replaceAll("");
removedText = removedBuffer.toString();
}

// get tokens for original string
// get tokens for original string which includes non-word tokens
strTokensStem = tokenizeStem(srcText);
strTokensNoStem = tokenizeNoStem(srcText);
strTokensAll = tokenizeAll(srcText);
/* HP: includes non - word tokens */

// travel by project entries, including orphaned
if (project.getProjectProperties().isSupportDefaultTranslations()) {
project.iterateByDefaultTranslations(new DefaultTranslationsIterator() {
public void iterate(String source, TMXEntry trans) {
checkStopped(stop);
if (!searchExactlyTheSame && source.equals(searchText)) {
// skip original==original entry comparison
return;
}
if (requiresTranslation && trans.translation == null) {
return;
}
String fileName = project.isOrphaned(source) ? ORPHANED_FILE_NAME : null;
processEntry(null, source, trans.translation, NearString.MATCH_SOURCE.MEMORY, false, 0,
fileName, trans.creator, trans.creationDate, trans.changer, trans.changeDate,
null);
}
});
}
project.iterateByMultipleTranslations(new MultipleTranslationsIterator() {
public void iterate(EntryKey source, TMXEntry trans) {
project.iterateByDefaultTranslations((source, trans) -> {
checkStopped(stop);
if (!searchExactlyTheSame && source.sourceText.equals(searchText)) {
if (!searchExactlyTheSame && source.equals(searchText)) {
// skip original==original entry comparison
return;
}
if (requiresTranslation && trans.translation == null) {
return;
}
String fileName = project.isOrphaned(source) ? ORPHANED_FILE_NAME : null;
processEntry(source, source.sourceText, trans.translation, NearString.MATCH_SOURCE.MEMORY,
false, 0, fileName, trans.creator, trans.creationDate, trans.changer,
trans.changeDate, null);
PrepareTMXEntry entry = new PrepareTMXEntry(trans);
entry.source = source;
processEntry(null, entry, fileName, NearString.MATCH_SOURCE.MEMORY, false, 0);
});
}
project.iterateByMultipleTranslations((source, trans) -> {
checkStopped(stop);
if (!searchExactlyTheSame && source.sourceText.equals(searchText)) {
// skip original==original entry comparison
return;
}
if (requiresTranslation && trans.translation == null) {
return;
}
String fileName = project.isOrphaned(source) ? ORPHANED_FILE_NAME : null;
PrepareTMXEntry entry = new PrepareTMXEntry(trans);
entry.source = source.sourceText;
processEntry(source, entry, fileName, NearString.MATCH_SOURCE.MEMORY, false, 0);
});

/*
* Penalty applied for fuzzy matches in another language (if no match in
* the target language was found).
*/
int foreignPenalty = Preferences.getPreferenceDefault(Preferences.PENALTY_FOR_FOREIGN_MATCHES,
Preferences.PENALTY_FOR_FOREIGN_MATCHES_DEFAULT);

// travel by translation memories
for (Map.Entry<String, ExternalTMX> en : project.getTransMemories().entrySet()) {
int penalty = 0;
Expand All @@ -248,28 +237,28 @@ public void iterate(EntryKey source, TMXEntry trans) {
if (requiresTranslation && tmen.getTranslationText() == null) {
continue;
}

int tmenPenalty = penalty;
if (tmen.hasPropValue(ExternalTMFactory.TMXLoader.PROP_FOREIGN_MATCH, "true")) {
tmenPenalty += foreignPenalty;
}

processEntry(null, tmen.getSourceText(), tmen.getTranslationText(),
NearString.MATCH_SOURCE.TM, false, tmenPenalty, en.getKey(), tmen.getCreator(),
tmen.getCreationDate(), tmen.getChanger(), tmen.getChangeDate(), tmen.getProperties());
processEntry(null, tmen, en.getKey(), NearString.MATCH_SOURCE.TM, false, tmenPenalty);
}
}

// travel by all entries for check source file translations
for (SourceTextEntry ste : project.getAllEntries()) {
checkStopped(stop);
if (ste.getSourceTranslation() != null) {
processEntry(ste.getKey(), ste.getSrcText(), ste.getSourceTranslation(),
NearString.MATCH_SOURCE.MEMORY, ste.isSourceTranslationFuzzy(), 0, ste.getKey().file,
"", 0, "", 0, null);
PrepareTMXEntry entry = new PrepareTMXEntry();
entry.source = ste.getSrcText();
entry.translation = ste.getSourceTranslation();
entry.creator = "";
entry.changer = "";
entry.creationDate = 0;
entry.changeDate = 0;
processEntry(ste.getKey(), entry, ste.getKey().file, NearString.MATCH_SOURCE.MEMORY,
ste.isSourceTranslationFuzzy(), 0);
}
}

if (separateSegmentMatcher != null) {
// split paragraph even when segmentation disabled, then find
// matches for every segment
Expand All @@ -279,13 +268,10 @@ public void iterate(EntryKey source, TMXEntry trans) {
Language targetLang = project.getProjectProperties().getTargetLanguage();
List<String> segments = Core.getSegmenter().segment(sourceLang, srcText, spaces, brules);
if (segments.size() > 1) {
int size = segments.size();
List<String> fsrc = new ArrayList<>(size);
List<String> ftrans = new ArrayList<>(size);
List<String> fsrc = new ArrayList<>(segments.size());
List<String> ftrans = new ArrayList<>(segments.size());
// multiple segments
for (short i = 0; i < size; i++) {
String onesrc = segments.get(i);

for (String onesrc : segments) {
// find match for a separate segment
List<NearString> segmentMatch = separateSegmentMatcher.search(onesrc, requiresTranslation,
false, stop);
Expand All @@ -298,25 +284,23 @@ public void iterate(EntryKey source, TMXEntry trans) {
ftrans.add("");
}
}
// glue found sources
String foundSrc = Core.getSegmenter().glue(sourceLang, sourceLang, fsrc, spaces, brules);
// glue found translations
String foundTrans = Core.getSegmenter().glue(sourceLang, targetLang, ftrans, spaces, brules);
processEntry(null, foundSrc, foundTrans, NearString.MATCH_SOURCE.TM, false, 0, "", "", 0, "",
0, null);
// glue found sources and translations
PrepareTMXEntry entry = new PrepareTMXEntry();
entry.source = Core.getSegmenter().glue(sourceLang, sourceLang, fsrc, spaces, brules);
entry.translation = Core.getSegmenter().glue(sourceLang, targetLang, ftrans, spaces, brules);
entry.creator = "";
entry.changer = "";
entry.creationDate = 0;
entry.changeDate = 0;
processEntry(null, entry, "", NearString.MATCH_SOURCE.TM, false, 0);
}
}

// fill similarity data only for a result
if (fillSimilarityData) {
// fill similarity data only for a result
for (NearString near : result) {
// fix for bug 1586397
byte[] similarityData = FuzzyMatcher.buildSimilarityData(strTokensAll,
tokenizeAll(near.source));
near.attr = similarityData;
near.attr = FuzzyMatcher.buildSimilarityData(strTokensAll, tokenizeAll(near.source));
}
}

return result;
}

Expand All @@ -325,10 +309,8 @@ public void iterate(EntryKey source, TMXEntry trans) {
*
* @param key
* entry to compare
* @param source
* source text
* @param translation
* translation text
* @param entry
* PrepareTMXEntry entry to process.
* @param comesFrom
* match source
* @param fuzzy
Expand All @@ -337,22 +319,11 @@ public void iterate(EntryKey source, TMXEntry trans) {
* penalty score
* @param tmxName
* tmx name
* @param creator
* translation creator
* @param creationDate
* creation date of translation
* @param changer
* last editor name
* @param changedDate
* last change date
* @param props
* TMX properties
*/
private void processEntry(EntryKey key, String source, String translation, NearString.MATCH_SOURCE comesFrom,
boolean fuzzy, int penalty, String tmxName, String creator, long creationDate,
String changer, long changedDate, List<TMXProp> props) {
public void processEntry(EntryKey key, ITMXEntry entry, String tmxName,
NearString.MATCH_SOURCE comesFrom, boolean fuzzy, int penalty) {
// remove part that is to be removed prior to tokenize
String realSource = source;
String realSource = entry.getSourceText();
int realPenaltyForRemoved = 0;
if (removePattern != null) {
StringBuilder entryRemovedText = new StringBuilder();
Expand Down Expand Up @@ -422,13 +393,12 @@ private void processEntry(EntryKey key, String source, String translation, NearS
return;
}

addNearString(key, source, translation, comesFrom, fuzzy, similarityStem, similarityNoStem,
simAdjusted, tmxName, creator, creationDate, changer, changedDate, props);
addNearString(key, entry, comesFrom, fuzzy, similarityStem, similarityNoStem, simAdjusted, tmxName);
}

/**
* Check if entries have a chance to be added to a result list.
* If true, there is no sense to calculate other parameters.
* Check whether an entry has no chance of being added to the result list.
* If true, there is no point in calculating the other parameters.
*
* @param simStem
* similarity with stemming
Expand All @@ -454,25 +424,29 @@ private boolean noChanceToAdd(int simStem, int simNoStem, int simExactly) {
}

/**
* Add near string into the result list. Near strings sorted by
* "similarity, simAdjusted"
* Add near string into the result list. Near strings sorted by "similarity,
* simAdjusted"
*/
private void addNearString(EntryKey key, String source, String translation, NearString.MATCH_SOURCE comesFrom,
boolean fuzzy, int similarity, int similarityNoStem, int simAdjusted, String tmxName,
String creator, long creationDate, String changer, long changedDate,
List<TMXProp> tuProperties) {
private void addNearString(EntryKey key, ITMXEntry entry, NearString.MATCH_SOURCE comesFrom, boolean fuzzy,
int similarity, int similarityNoStem, int simAdjusted, String tmxName) {
final String source = entry.getSourceText();
final String translation = entry.getTranslationText();
final String creator = entry.getCreator();
final long creationDate = entry.getCreationDate();
final String changer = entry.getChanger();
final long changedDate = entry.getChangeDate();
final List<TMXProp> tuProperties = entry.getProperties();
// find position for new data
int pos = 0;
for (int i = 0; i < result.size(); i++) {
NearString st = result.get(i);
if (source.equals(st.source) && Objects.equals(translation, st.translation)) {
// Consolidate identical matches from different sources into a
// single NearString with
// multiple project entries.
// single NearString with multiple project entries.
result.set(i,
NearString.merge(st, key, source, translation, comesFrom, fuzzy, similarity,
similarityNoStem, simAdjusted, null, tmxName, creator, creationDate,
changer, changedDate, tuProperties));
similarityNoStem, simAdjusted, null, tmxName, creator, creationDate, changer,
changedDate, tuProperties));
return;
}
if (st.scores[0].score < similarity) {
Expand Down Expand Up @@ -552,10 +526,22 @@ private void checkStopped(IStopped stop) throws StoppedException {
}

/**
* Process will throw this exception if it stopped.All callers must catch it
* and just skip.
* Thrown when the process has been stopped. All callers must catch it and
* just skip.
*/
@SuppressWarnings("serial")
public static class StoppedException extends RuntimeException {
    // Intentionally empty: no fields, constructors or methods are declared,
    // so the exception type itself is the only information it carries.
    // It is unchecked (extends RuntimeException), and no serialVersionUID
    // is declared, hence the "serial" warning suppression above.
}

/**
 * Simple mutable holder that groups three integer similarity scores.
 * <p>
 * NOTE(review): the field names mirror the {@code similarity},
 * {@code similarityNoStem} and {@code simAdjusted} parameters of
 * {@code addNearString}; presumably they carry the same values, confirm
 * against the callers.
 */
static class Similarity {
    /** First score, stored from the first constructor argument. */
    int similarity;
    /** Second score, stored from the second constructor argument. */
    int similarityNoStem;
    /** Third score, stored from the third constructor argument. */
    int simAdjusted;

    /**
     * Creates a holder with all three scores set.
     *
     * @param similarityScore
     *            value stored in {@link #similarity}
     * @param noStemScore
     *            value stored in {@link #similarityNoStem}
     * @param adjustedScore
     *            value stored in {@link #simAdjusted}
     */
    Similarity(int similarityScore, int noStemScore, int adjustedScore) {
        this.similarity = similarityScore;
        this.similarityNoStem = noStemScore;
        this.simAdjusted = adjustedScore;
    }
}
}

0 comments on commit ff3040d

Please sign in to comment.