Skip to content
This repository has been archived by the owner on Jun 16, 2018. It is now read-only.

Commit

Permalink
wiktionary_article_to_xml_1: Rudimentary start of parsing the transla…
Browse files Browse the repository at this point in the history
…tion table.
  • Loading branch information
skreutzer committed Aug 2, 2017
1 parent 3b15a5c commit 94acc70
Show file tree
Hide file tree
Showing 4 changed files with 280 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -123,13 +123,11 @@ public StringBuilder word()
sbDefinitions.append(sbDefinition.toString());
}
}
/*
else if (token.equals("=="))
{
this.tokenCursor -= 1;
break;
}
*/
}

if (sbDefinitions.length() > 0)
Expand Down Expand Up @@ -193,19 +191,41 @@ public StringBuilder definition()
return null;
}
}
/*
else if (token.equals("Übersetzungen"))
{
++this.tokenCursor;
}
*/

if (match(" ") != true ||
match("===") != true)
{
return null;
}

StringBuilder sbTranslations = new StringBuilder();

while (this.tokenCursor < this.tokens.size())
{
token = nextToken();

if (token.equals("===="))
{
StringBuilder sbTranslation = translations();

if (sbTranslation != null)
{
sbTranslations.append(sbTranslation.toString());
}
}
else if (token.equals("==") == true ||
token.equals("===") == true)
{
this.tokenCursor -= 1;
break;
}
}

if (sbTranslations.length() > 0)
{
sbResult.append(sbTranslations.toString());
}

sbResult.append("</definition>");

return sbResult;
Expand Down Expand Up @@ -336,6 +356,191 @@ else if (token.equals("n"))
return null;
}

/**
 * Parses a translation table starting at the current token cursor.
 *
 * The method first requires the exact header token sequence
 * " ", "{{", "Übersetzungen", "}}", " ", "====", newline,
 * "{{", "Ü", "-", "Tabelle", "|", "Ü", "-", "links", "=", newline.
 * Afterwards every "*" token is handed to {@link #translation()}; any
 * other token is skipped. Scanning stops when the tokens are exhausted
 * or when a heading token ("==", "===" or "====") is read, in which case
 * the cursor is moved back by one so the heading stays available to the
 * caller.
 *
 * @return a {@code <translations>} element wrapping all entries that
 *         {@link #translation()} parsed successfully (possibly none), or
 *         {@code null} if the header sequence did not match.
 */
public StringBuilder translations()
{
    final String[] expectedHeader =
    {
        " ", "{{", "Übersetzungen", "}}", " ", "====", "\n",
        "{{", "Ü", "-", "Tabelle", "|", "Ü", "-", "links", "=", "\n"
    };

    // Stop at the first mismatch, exactly like a short-circuiting || chain.
    for (int i = 0; i < expectedHeader.length; i++)
    {
        if (!match(expectedHeader[i]))
        {
            return null;
        }
    }

    StringBuilder sbResult = new StringBuilder("<translations>");

    while (this.tokenCursor < this.tokens.size())
    {
        String token = nextToken();

        if (token.equals("*"))
        {
            StringBuilder sbEntry = translation();

            // Entries that failed to parse are dropped silently here.
            if (sbEntry != null)
            {
                sbResult.append(sbEntry.toString());
            }

            continue;
        }

        boolean isHeading = token.equals("==") ||
                            token.equals("===") ||
                            token.equals("====");

        if (isHeading)
        {
            // Un-read the heading token so the caller can handle it.
            this.tokenCursor -= 1;
            break;
        }
    }

    sbResult.append("</translations>");

    return sbResult;
}

/**
 * Parses a single translation entry, starting right after its "*" token.
 *
 * Only the language code "ar" is accepted at present; any other code is
 * reported via an info message and the entry is rejected. The expected
 * shape is: "{{", language code, arbitrary tokens up to "}}", ":",
 * arbitrary tokens up to "[", one discarded token, then
 * "]", " ", "{{", "Üxx", "4", "|", "ar", "|", the display tokens up to
 * "|", "v", "=", and the vocalization tokens up to "|".
 *
 * NOTE(review): token text is appended to the XML output as-is; confirm
 * that tokens are already XML-safe at this point.
 *
 * @return a {@code <translation>} element containing
 *         {@code <language-code>}, {@code <display>} and
 *         {@code <vocalization>}, or {@code null} if the entry does not
 *         have the expected shape or a "*" token interrupts it.
 */
public StringBuilder translation()
{
    if (!match("{{"))
    {
        return null;
    }

    String language = nextToken();

    if (!language.equals("ar"))
    {
        this.infoMessages.add(constructInfoMessage("messageParsingError", true, null, null, "ar", language));
        return null;
    }

    StringBuilder sbResult = new StringBuilder();
    sbResult.append("<translation>");
    sbResult.append("<language-code>" + language + "</language-code>");

    // Skip the remainder of the language template.
    if (!skipTranslationTokensUntil("}}"))
    {
        return null;
    }

    if (!match(":"))
    {
        return null;
    }

    // Skip ahead to the opening bracket of the linked word.
    if (!skipTranslationTokensUntil("["))
    {
        return null;
    }

    // The token directly after "[" is consumed without being inspected.
    nextToken();

    final String[] expectedTemplateStart = { "]", " ", "{{", "Üxx", "4", "|", "ar", "|" };

    for (int i = 0; i < expectedTemplateStart.length; i++)
    {
        if (!match(expectedTemplateStart[i]))
        {
            return null;
        }
    }

    String display = collectTranslationTokensUntilPipe();
    sbResult.append("<display>" + display + "</display>");

    if (!match("v") || !match("="))
    {
        return null;
    }

    String vocalization = collectTranslationTokensUntilPipe();
    sbResult.append("<vocalization>" + vocalization + "</vocalization>");
    sbResult.append("</translation>");

    return sbResult;
}

/**
 * Consumes tokens until the given terminator has been read or no tokens
 * are left.
 *
 * @param terminator token that ends the skipping; it is consumed as well.
 * @return {@code false} if a "*" token was read first (an info message
 *         is recorded in that case), {@code true} otherwise, including
 *         when the tokens ran out before the terminator appeared.
 */
private boolean skipTranslationTokensUntil(String terminator)
{
    while (this.tokenCursor < this.tokens.size())
    {
        String token = nextToken();

        if (token.equals("*"))
        {
            this.infoMessages.add(constructInfoMessage("messageParsingOfSequenceAborted", true, null, null, "translation", token));
            return false;
        }

        if (token.equals(terminator))
        {
            break;
        }
    }

    return true;
}

/**
 * Concatenates tokens until a "|" token has been read (and consumed) or
 * no tokens are left.
 *
 * @return the concatenated token text, without the terminating "|".
 */
private String collectTranslationTokensUntilPipe()
{
    StringBuilder sbCollected = new StringBuilder();

    while (this.tokenCursor < this.tokens.size())
    {
        String token = nextToken();

        if (token.equals("|"))
        {
            break;
        }

        sbCollected.append(token);
    }

    return sbCollected.toString();
}

public boolean match(String required)
{
String token = nextToken();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Copyright (C) 2015-2017 Stephan Kreutzer
#
# This file is part of wiktionary_article_to_xml_1, a submodule of the
# wiktionary_dump_to_xml_1 package.
#
# wiktionary_article_to_xml_1 is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License version 3 or any later version,
# as published by the Free Software Foundation.
#
# wiktionary_article_to_xml_1 is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License 3 for more details.
#
# You should have received a copy of the GNU Affero General Public License 3
# along with wiktionary_article_to_xml_1. If not, see <http://www.gnu.org/licenses/>.

messageArgumentsMissingUsage = Verwendung:
messageParameterList = job-datei ergebnis-info-datei
messageResultInfoFileCantGetCanonicalPath = Kann keinen kanonischen Pfad für die Ergebnisinformationsdatei "{0}" ermitteln.
messageResultInfoPathIsntAFile = Ergebnisinformationspfad "{0}" existiert bereits, referenziert aber keine Datei.
messageResultInfoFileIsntWritable = Ergebnisinformationsdatei "{0}" ist nicht schreibbar.
messageJobFileCantGetCanonicalPath = Kann keinen kanonischen Pfad für die Jobdatei "{0}" ermitteln.
messageJobFileDoesntExist = Jobdatei "{0}" existiert nicht.
messageJobPathIsntAFile = Jobpfad "{0}" ist keine Datei.
messageJobFileIsntReadable = Jobdatei "{0}" ist nicht lesbar.
messageCallDetails = Aufgerufen mit Jobdatei "{0}" und Ergebnisinformationsdatei "{1}".
messageJobFileEntryIsMissingAnAttribute = Element "{1}" in Jobdatei "{0}" fehlt das "{2}"-Attribut.
messageJobFileElementConfiguredMoreThanOnce = Element "{1}" mehr als einmal konfiguriert in Jobdatei "{0}".
messageInputFileCantGetCanonicalPath = Kann keinen kanonischen Pfad für die Eingabedatei "{0}" wie in Jobdatei "{1}" hinterlegt ermitteln.
messageInputFileDoesntExist = Die Eingabe-Datei "{1}" laut Job-Datei "{0}" existiert nicht.
messageInputPathIsntAFile = Der Eingabe-Pfad "{1}" laut Job-Datei "{0}" referenziert keine Datei.
messageInputFileIsntReadable = Die Eingabe-Datei "{1}" laut Job-Datei "{0}" ist nicht lesbar.
messageOutputFileCantGetCanonicalPath = Kann keinen kanonischen Pfad für die Ausgabedatei "{0}" wie in Jobdatei "{1}" hinterlegt ermitteln.
messageOutputPathIsntAFile = Ausgabepfad "{0}" wie in Jobdatei "{1}" hinterlegt existiert, ist aber keine Datei.
messageOutputFileIsntWritable = Ausgabedatei "{0}" wie in Jobdatei "{1}" hinterlegt existiert, ist aber nicht überschreibbar.
messageJobFileErrorWhileReading = Während die Jobdatei "{0}" ausgelesen wurde, ist ein Fehler aufgetreten.
messageJobFileInputFileIsntConfigured = Keine Eingabedatei mittels Element "{1}" in Jobdatei "{0}" konfiguriert.
messageJobFileOutputFileIsntConfigured = Keine Ausgabedatei mittels Element "{1}" in Jobdatei "{0}" konfiguriert.
messageTokenizerCharacterIsRepeating = Tokenizer: Zeichen ''{0}'' wiederholt sich.
messageTokenizerInvalidCharacter = Tokenizer: Ungültiges Zeichen ''{0}'' ({1}).
messageTokenizerUnknownCharacter = Tokenizer: Unbekanntes Zeichen ''{0}'' ({1}).
messageTokenizerErrorWhileTokenizing = Tokenizer: Es ist ein Fehler während des Tokenisierens aufgetreten.
messageParserErrorWhileParsing = Parser: Es ist ein Fehler während des Parsens aufgetreten.
messageParserFailed = Die Eingabedatei "{0}" konnte nicht erfolgreich geparst werden.
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@

messageParserNoMoreTokens = Parser: Ran out of tokens while more tokens were expected.
messageParsingError = Parsing error: "{0}" expected, found "{1}".
messageParsingOfSequenceAborted = Parsing of sequence "{0}" aborted by termination "{1}".
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Copyright (C) 2017 Stephan Kreutzer
#
# This file is part of wiktionary_article_to_xml_1, a submodule of the
# wiktionary_dump_to_xml_1 package.
#
# wiktionary_article_to_xml_1 is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License version 3 or any later version,
# as published by the Free Software Foundation.
#
# wiktionary_article_to_xml_1 is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License 3 for more details.
#
# You should have received a copy of the GNU Affero General Public License 3
# along with wiktionary_article_to_xml_1. If not, see <http://www.gnu.org/licenses/>.


messageParserNoMoreTokens = Parser: Keine weiteren Tokens vorgefunden, obwohl weitere erwartet werden.
messageParsingError = Parsing-Fehler: "{0}" erwartet, "{1}" vorgefunden.
messageParsingOfSequenceAborted = Parsing der Sequenz "{0}" abgebrochen durch Termination "{1}".

0 comments on commit 94acc70

Please sign in to comment.