wiktionary_article_to_xml_1: Rudimentary start of parsing the translation table.
publishing-systems · Aug 2, 2017 · 94acc70 · 94acc70
1 parent 3b15a5c
commit 94acc70
Show file tree

Hide file tree

Showing 4 changed files with 280 additions and 8 deletions.
diff --git a/org.wiktionary/wiktionary_dump_to_xml_1/wiktionary_article_to_xml_1/ParserGerman.java b/org.wiktionary/wiktionary_dump_to_xml_1/wiktionary_article_to_xml_1/ParserGerman.java
@@ -123,13 +123,11 @@ public StringBuilder word()
                     sbDefinitions.append(sbDefinition.toString());
                 }
             }
-            /*
             else if (token.equals("=="))
             {
                 this.tokenCursor -= 1;
                 break;
             }
-            */
         }
 
         if (sbDefinitions.length() > 0)
@@ -193,19 +191,41 @@ public StringBuilder definition()
                 return null;
             }
         }
-        /*
-        else if (token.equals("Übersetzungen"))
-        {
-            ++this.tokenCursor;
-        }
-        */
 
         if (match(" ") != true ||
             match("===") != true)
         {
             return null;
         }
 
+        StringBuilder sbTranslations = new StringBuilder();
+
+        while (this.tokenCursor < this.tokens.size())
+        {
+            token = nextToken();
+
+            if (token.equals("===="))
+            {
+                StringBuilder sbTranslation = translations();
+
+                if (sbTranslation != null)
+                {
+                    sbTranslations.append(sbTranslation.toString());
+                }
+            }
+            else if (token.equals("==") == true ||
+                     token.equals("===") == true)
+            {
+                this.tokenCursor -= 1;
+                break;
+            }
+        }
+
+        if (sbTranslations.length() > 0)
+        {
+            sbResult.append(sbTranslations.toString());
+        }
+
         sbResult.append("</definition>");
 
         return sbResult;
@@ -336,6 +356,191 @@ else if (token.equals("n"))
         return null;
     }
 
+    public StringBuilder translations() // Parses a translations section into a <translations> element; returns null when the fixed section header does not match.
+    {
+        StringBuilder sbResult = new StringBuilder();
+
+        sbResult.append("<translations>");
+
+        if (match(" ") != true || // Expected tokens, in order: " {{Übersetzungen}} ====", a newline, "{{Ü-Tabelle|Ü-links=", a newline.
+            match("{{") != true ||
+            match("Übersetzungen") != true ||
+            match("}}") != true ||
+            match(" ") != true ||
+            match("====") != true ||
+            match("\n") != true ||
+            match("{{") != true ||
+            match("Ü") != true ||
+            match("-") != true ||
+            match("Tabelle") != true ||
+            match("|") != true ||
+            match("Ü") != true ||
+            match("-") != true ||
+            match("links") != true ||
+            match("=") != true ||
+            match("\n") != true)
+        {
+            return null; // Header mismatch: the already started <translations> output is discarded.
+        }
+
+        StringBuilder sbTranslations = new StringBuilder();
+
+        while (this.tokenCursor < this.tokens.size()) // Scan the remaining tokens for list items; tokens matching neither branch below are skipped.
+        {
+            String token = nextToken();
+
+            if (token.equals("*")) // A "*" token starts one translation entry.
+            {
+                StringBuilder sbTranslation = translation();
+
+                if (sbTranslation != null) // Entries that failed to parse (null) are left out of the result.
+                {
+                    sbTranslations.append(sbTranslation.toString());
+                }
+            }
+            else if (token.equals("==") == true || // Any heading marker ends this section.
+                     token.equals("===") == true ||
+                     token.equals("====") == true)
+            {
+                this.tokenCursor -= 1; // Step back one token — presumably so the caller sees the heading marker again; TODO confirm nextToken() advances by exactly one.
+                break;
+            }
+        }
+
+        if (sbTranslations.length() > 0)
+        {
+            sbResult.append(sbTranslations.toString());
+        }
+
+        sbResult.append("</translations>"); // The element is closed even when no entry was collected.
+
+        return sbResult;
+    }
+
+    public StringBuilder translation() // Parses one translation entry into a <translation> element; returns null whenever the expected token pattern is not met.
+    {
+        StringBuilder sbResult = new StringBuilder();
+
+        sbResult.append("<translation>");
+
+        if (match("{{") != true) // The entry has to begin with "{{".
+        {
+            return null;
+        }
+
+        String language = nextToken(); // The token right after "{{" is taken as the language code.
+
+        if (language.equals("ar") == true) // Only the code "ar" is accepted at this stage.
+        {
+            sbResult.append("<language-code>" + language + "</language-code>");
+        }
+        else
+        {
+            this.infoMessages.add(constructInfoMessage("messageParsingError", true, null, null, "ar", language)); // Record which code was found instead of "ar", then give up on this entry.
+            return null;
+        }
+
+        while (this.tokenCursor < this.tokens.size()) // Skip forward to the closing "}}".
+        {
+            String token = nextToken();
+
+            if (token.equals("*") == true) // A "*" before the "}}" aborts this entry with an info message.
+            {
+                this.infoMessages.add(constructInfoMessage("messageParsingOfSequenceAborted", true, null, null, "translation", token));
+                return null; // NOTE(review): the cursor is not stepped back here, so this "*" looks lost to the caller's loop — confirm.
+            }
+
+            if (token.equals("}}") != true)
+            {
+                // Ignore/consume.
+            }
+            else
+            {
+                break;
+            }
+        }
+
+        if (match(":") != true) // A ":" must follow the "}}".
+        {
+            return null;
+        }
+
+        while (this.tokenCursor < this.tokens.size()) // Skip forward to the next "[".
+        {
+            String token = nextToken();
+
+            if (token.equals("*") == true) // Same abort rule as in the loop above.
+            {
+                this.infoMessages.add(constructInfoMessage("messageParsingOfSequenceAborted", true, null, null, "translation", token));
+                return null;
+            }
+
+            if (token.equals("[") != true)
+            {
+                // Ignore/consume.
+            }
+            else
+            {
+                break;
+            }
+        }
+
+        nextToken(); // One token is dropped unchecked — presumably the sense number between "[" and "]"; TODO confirm.
+
+        if (match("]") != true || // Expected tokens, in order: "] {{Üxx4|ar|".
+            match(" ") != true ||
+            match("{{") != true ||
+            match("Üxx") != true ||
+            match("4") != true ||
+            match("|") != true ||
+            match("ar") != true ||
+            match("|") != true)
+        {
+            return null;
+        }
+
+        StringBuilder sbDisplay = new StringBuilder();
+
+        while (this.tokenCursor < this.tokens.size()) // Everything up to the next "|" becomes the display text; the "|" itself is consumed.
+        {
+            String token = nextToken();
+
+            if (token.equals("|") == true)
+            {
+                break;
+            }
+
+            sbDisplay.append(token);
+        }
+
+        sbResult.append("<display>" + sbDisplay.toString() + "</display>"); // NOTE(review): token text is concatenated as-is; confirm XML escaping happens elsewhere.
+
+        if (match("v") != true || // Expected tokens, in order: "v=".
+            match("=") != true)
+        {
+            return null;
+        }
+
+        StringBuilder sbVocalization = new StringBuilder();
+
+        while (this.tokenCursor < this.tokens.size()) // Everything up to the next "|" becomes the vocalization; the loop also ends if the tokens run out.
+        {
+            String token = nextToken();
+
+            if (token.equals("|") == true)
+            {
+                break;
+            }
+
+            sbVocalization.append(token);
+        }
+
+        sbResult.append("<vocalization>" + sbVocalization.toString() + "</vocalization>");
+        sbResult.append("</translation>"); // Tokens after the vocalization are not consumed by this method.
+
+        return sbResult;
+    }
+
     public boolean match(String required)
     {
         String token = nextToken();

diff --git a/...o_xml_1/wiktionary_article_to_xml_1/l10n/l10nWiktionaryArticleToXml1Console_de.properties b/...o_xml_1/wiktionary_article_to_xml_1/l10n/l10nWiktionaryArticleToXml1Console_de.properties
@@ -0,0 +1,45 @@
+# Copyright (C) 2015-2017 Stephan Kreutzer
+#
+# This file is part of wiktionary_article_to_xml_1, a submodule of the
+# wiktionary_dump_to_xml_1 package.
+#
+# wiktionary_article_to_xml_1 is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License version 3 or any later version,
+# as published by the Free Software Foundation.
+#
+# wiktionary_article_to_xml_1 is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License 3 for more details.
+#
+# You should have received a copy of the GNU Affero General Public License 3
+# along with wiktionary_article_to_xml_1. If not, see <http://www.gnu.org/licenses/>.
+
+messageArgumentsMissingUsage = Verwendung:
+messageParameterList = job-datei ergebnis-info-datei
+messageResultInfoFileCantGetCanonicalPath = Kann keinen kanonischen Pfad für die Ergebnisinformationsdatei "{0}" ermitteln.
+messageResultInfoPathIsntAFile = Ergebnisinformationspfad "{0}" existiert bereits, referenziert aber keine Datei.
+messageResultInfoFileIsntWritable = Ergebnisinformationsdatei "{0}" ist nicht schreibbar.
+messageJobFileCantGetCanonicalPath = Kann keinen kanonischen Pfad für die Jobdatei "{0}" ermitteln.
+messageJobFileDoesntExist = Jobdatei "{0}" existiert nicht.
+messageJobPathIsntAFile = Jobpfad "{0}" ist keine Datei.
+messageJobFileIsntReadable = Jobdatei "{0}" ist nicht lesbar.
+messageCallDetails = Aufgerufen mit Jobdatei "{0}" und Ergebnisinformationsdatei "{1}".
+messageJobFileEntryIsMissingAnAttribute = Element "{1}" in Jobdatei "{0}" fehlt das "{2}"-Attribut.
+messageJobFileElementConfiguredMoreThanOnce = Element "{1}" mehr als einmal konfiguriert in Jobdatei "{0}".
+messageInputFileCantGetCanonicalPath = Kann keinen kanonischen Pfad für die Eingabedatei "{0}" wie in Jobdatei "{1}" hinterlegt ermitteln.
+messageInputFileDoesntExist = Die Eingabe-Datei "{1}" laut Job-Datei "{0}" existiert nicht.
+messageInputPathIsntAFile = Der Eingabe-Pfad "{1}" laut Job-Datei "{0}" referenziert keine Datei.
+messageInputFileIsntReadable = Die Eingabe-Datei "{1}" laut Job-Datei "{0}" ist nicht lesbar.
+messageOutputFileCantGetCanonicalPath = Kann keinen kanonischen Pfad für die Ausgabedatei "{0}" wie in Jobdatei "{1}" hinterlegt ermitteln.
+messageOutputPathIsntAFile = Ausgabepfad "{0}" wie in Jobdatei "{1}" hinterlegt existiert, ist aber keine Datei.
+messageOutputFileIsntWritable = Ausgabedatei "{0}" wie in Jobdatei "{1}" hinterlegt existiert, ist aber nicht überschreibbar.
+messageJobFileErrorWhileReading = Während die Jobdatei "{0}" ausgelesen wurde, ist ein Fehler aufgetreten.
+messageJobFileInputFileIsntConfigured = Keine Eingabedatei mittels Element "{1}" in Jobdatei "{0}" konfiguriert.
+messageJobFileOutputFileIsntConfigured = Keine Ausgabedatei mittels Element "{1}" in Jobdatei "{0}" konfiguriert.
+messageTokenizerCharacterIsRepeating = Tokenizer: Zeichen ''{0}'' wiederholt sich.
+messageTokenizerInvalidCharacter = Tokenizer: Ungültiges Zeichen ''{0}'' ({1}).
+messageTokenizerUnknownCharacter = Tokenizer: Unbekanntes Zeichen ''{0}'' ({1}).
+messageTokenizerErrorWhileTokenizing = Tokenizer: Es ist ein Fehler während des Tokenisierens aufgetreten.
+messageParserErrorWhileParsing = Parser: Es ist ein Fehler während des Parsens aufgetreten.
+messageParserFailed = Die Eingabedatei "{0}" konnte nicht erfolgreich geparst werden.
diff --git a/...xml_1/wiktionary_article_to_xml_1/l10n/l10nWiktionaryArticleToXml1ParserGerman.properties b/...xml_1/wiktionary_article_to_xml_1/l10n/l10nWiktionaryArticleToXml1ParserGerman.properties
@@ -18,3 +18,4 @@
 
 messageParserNoMoreTokens = Parser: Run out of tokens while more tokens were expected.
 messageParsingError = Parsing error: "{0}" expected, found "{1}".
+messageParsingOfSequenceAborted = Parsing of sequence "{0}" aborted by termination "{1}".
diff --git a/..._1/wiktionary_article_to_xml_1/l10n/l10nWiktionaryArticleToXml1ParserGerman_de.properties b/..._1/wiktionary_article_to_xml_1/l10n/l10nWiktionaryArticleToXml1ParserGerman_de.properties
@@ -0,0 +1,21 @@
+# Copyright (C) 2017 Stephan Kreutzer
+#
+# This file is part of wiktionary_article_to_xml_1, a submodule of the
+# wiktionary_dump_to_xml_1 package.
+#
+# wiktionary_article_to_xml_1 is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License version 3 or any later version,
+# as published by the Free Software Foundation.
+#
+# wiktionary_article_to_xml_1 is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License 3 for more details.
+#
+# You should have received a copy of the GNU Affero General Public License 3
+# along with wiktionary_article_to_xml_1. If not, see <http://www.gnu.org/licenses/>.
+
+
+messageParserNoMoreTokens = Parser: Keine weiteren Tokens vorgefunden, obwohl weitere erwartet werden.
+messageParsingError = Parsing-Fehler: "{0}" erwartet, "{1}" vorgefunden.
+messageParsingOfSequenceAborted = Parsing der Sequenz "{0}" abgebrochen durch Termination "{1}".