From 7d2eb55d8c071e4d32601494ebff37764f0c5ec0 Mon Sep 17 00:00:00 2001 From: Subalalitha Date: Fri, 18 Mar 2022 21:36:24 +0530 Subject: [PATCH] [TIKA-3704] Include support for FirstLanguage Translate API --- .../impl/FirstLanguageTranslator.java | 321 ++++++++++++++++++ ....apache.tika.language.translate.Translator | 1 + .../impl/translator.firstlanguage.properties | 20 ++ .../impl/FirstLanguageTranslatorTest.java | 43 +++ 4 files changed, 385 insertions(+) create mode 100644 tika-translate/src/main/java/org/apache/tika/language/translate/impl/FirstLanguageTranslator.java create mode 100644 tika-translate/src/main/resources/org/apache/tika/language/translate/impl/translator.firstlanguage.properties create mode 100644 tika-translate/src/test/java/org/apache/tika/language/translate/impl/FirstLanguageTranslatorTest.java diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/impl/FirstLanguageTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/impl/FirstLanguageTranslator.java new file mode 100644 index 0000000000..9c273bcf70 --- /dev/null +++ b/tika-translate/src/main/java/org/apache/tika/language/translate/impl/FirstLanguageTranslator.java @@ -0,0 +1,321 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.language.translate.impl; + +import static java.nio.charset.StandardCharsets.UTF_8; + +import java.io.BufferedReader; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.HttpURLConnection; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.Properties; + +import javax.ws.rs.core.MediaType; +import javax.ws.rs.core.Response; + +import org.apache.cxf.jaxrs.client.WebClient; +import org.apache.tika.exception.TikaException; + +import com.fasterxml.jackson.core.JsonParseException; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ObjectNode; +import com.fasterxml.jackson.jaxrs.json.JacksonJsonProvider; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * An implementation of a REST client for the FirstLanguage API. + * You can sign up for an API Key online on the FirstLanguage Developer Portal + * and set your API Key in the translator.firstlanguage.properties file. 
+ */
+public class FirstLanguageTranslator extends AbstractTranslator {
+
+    private static final Logger LOG = LoggerFactory.getLogger(FirstLanguageTranslator.class);
+
+    private static final String FL_TRANSLATE_URL_BASE =
+            "https://api.firstlanguage.in/api/translate";
+
+    private static final String CONFIG_RESOURCE = "translator.firstlanguage.properties";
+
+    /** Placeholder key shipped in the properties file; it leaves the translator disabled. */
+    private static final String DEFAULT_KEY = "dummy-key";
+
+    private static final String CONTENT_TYPE_PLAINTEXT = "plaintext";
+    private static final String CONTENT_TYPE_HTML = "html";
+    private static final String CONTENT_TYPE_PDF = "pdf";
+    private static final String CONTENT_TYPE_DOCX = "docx";
+
+    /** Shared JSON mapper; Jackson documents ObjectMapper as thread-safe once configured. */
+    private static final ObjectMapper MAPPER = new ObjectMapper();
+
+    private String apiKey;
+
+    private boolean available;
+
+    /**
+     * Creates a translator configured from {@code translator.firstlanguage.properties}.
+     * The translator is only available when that file provides a {@code translator.api-key}
+     * value that is non-empty and differs from the placeholder {@code dummy-key}.
+     */
+    public FirstLanguageTranslator() {
+        Properties config = new Properties();
+        try (InputStream stream =
+                     FirstLanguageTranslator.class.getResourceAsStream(CONFIG_RESOURCE)) {
+            if (stream == null) {
+                LOG.warn("Couldn't find config file {}", CONFIG_RESOURCE);
+            } else {
+                config.load(stream);
+                this.apiKey = config.getProperty("translator.api-key");
+            }
+        } catch (IOException | IllegalArgumentException e) {
+            // Properties.load reports malformed escapes as IllegalArgumentException.
+            LOG.warn("Couldn't read config file", e);
+        }
+        this.available = isUsableKey(this.apiKey);
+    }
+
+    /**
+     * Sets the API key used for all subsequent requests and re-evaluates availability.
+     *
+     * @param apiKey The API key you got from the FirstLanguage Portal. A null, empty or
+     *               placeholder ("dummy-key") value leaves the translator unavailable.
+     */
+    public void setApiKey(String apiKey) {
+        this.apiKey = apiKey;
+        this.available = isUsableKey(apiKey);
+    }
+
+    /**
+     * Asks the service to translate the PDF or DOCX document found at the given URL and
+     * writes the response body to the given file. The request is sent with
+     * {@code preserveFormat} set to {@code "true"}.
+     *
+     * @param url                    The url of the document to be translated.
+     * @param targetLanguage         The target language.
+     * @param contentType            The content type of the url. It must be pdf or docx.
+     * @param pathWithFileNameToSave The path where the translated PDF or DOCX is to be saved.
+     *                               This should include the filename.
+     * @return The literal string "File Translated" on success, or {@code url} unchanged when
+     *         the translator is not available.
+     * @throws TikaException if an argument is invalid, the server answers with a non-2xx
+     *                       status, or the response cannot be read or saved.
+     * @throws IOException   declared for interface compatibility.
+     */
+    public String translatePDFOrDOCXFile(String url, String targetLanguage, String contentType,
+                                         String pathWithFileNameToSave)
+            throws TikaException, IOException {
+        if (!this.available) {
+            return url;
+        }
+        requireNonEmpty(url, "URL cannot be null or empty");
+        requireNonEmpty(contentType, "Content Type cannot be null or empty");
+        // equals(), not ==: callers may pass strings that are not interned literals.
+        if (!CONTENT_TYPE_PDF.equals(contentType) && !CONTENT_TYPE_DOCX.equals(contentType)) {
+            throw new TikaException("Content Type must be pdf or docx for this method.");
+        }
+        requireNonEmpty(pathWithFileNameToSave,
+                "Path with filename to save cannot be null or empty");
+
+        ObjectNode inputNode = MAPPER.createObjectNode();
+        inputNode.put("lang", targetLanguage);
+        inputNode.put("url", url);
+        inputNode.put("contentType", contentType);
+        inputNode.put("preserveFormat", "true");
+        Response response = post(inputNode);
+
+        // The entity stream is opened first so a missing body does not create an empty file.
+        try (InputStream in = entityStream(response);
+             FileOutputStream out = new FileOutputStream(pathWithFileNameToSave)) {
+            byte[] buffer = new byte[8192];
+            int read;
+            while ((read = in.read(buffer)) != -1) {
+                out.write(buffer, 0, read);
+            }
+        } catch (IOException e) {
+            throw new TikaException("Error while reading response", e);
+        }
+
+        return "File Translated";
+    }
+
+    /**
+     * Asks the service to translate the content found at the given URL and returns the
+     * result as text. For html and plaintext the first {@code generated_text} value of the
+     * JSON response is returned; for pdf and docx the response body is returned as received.
+     *
+     * @param url            The url from which the text to be translated is extracted.
+     * @param targetLanguage The target language.
+     * @param contentType    The content type of the url. It can be plaintext, html, pdf or docx.
+     * @return The translated text, or {@code url} unchanged when the translator is not
+     *         available.
+     * @throws TikaException if an argument is invalid, the server answers with a non-2xx
+     *                       status, or the response does not contain a translation.
+     * @throws IOException   if the response cannot be read.
+     */
+    public String translateFromURL(String url, String targetLanguage, String contentType)
+            throws TikaException, IOException {
+        if (!this.available) {
+            return url;
+        }
+        requireNonEmpty(url, "URL cannot be null or empty");
+        requireNonEmpty(contentType, "Content Type cannot be null or empty");
+
+        boolean jsonResponse = CONTENT_TYPE_PLAINTEXT.equals(contentType)
+                || CONTENT_TYPE_HTML.equals(contentType);
+        boolean documentResponse = CONTENT_TYPE_PDF.equals(contentType)
+                || CONTENT_TYPE_DOCX.equals(contentType);
+        if (!jsonResponse && !documentResponse) {
+            throw new TikaException("Content Type must be plaintext, html, pdf or docx");
+        }
+
+        ObjectNode inputNode = MAPPER.createObjectNode();
+        inputNode.put("lang", targetLanguage);
+        inputNode.put("url", url);
+        inputNode.put("contentType", contentType);
+
+        String responseText = readBody(post(inputNode));
+        if (documentResponse) {
+            return responseText;
+        }
+        return extractGeneratedText(responseText,
+                "Error requesting translation of '" + url + "' to '" + targetLanguage + "'");
+    }
+
+    /**
+     * This function translates the text passed to the target language specified.
+     *
+     * @param text           The text to be translated.
+     * @param sourceLanguage The source language. It is not sent to the service and is only
+     *                       used in error messages.
+     * @param targetLanguage The target language.
+     * @return The translated text, or {@code text} unchanged when the translator is not
+     *         available.
+     * @throws TikaException if the server answers with a non-2xx status or the response does
+     *                       not contain a translation.
+     * @throws IOException   if the response cannot be read.
+     */
+    @Override
+    public String translate(String text, String sourceLanguage, String targetLanguage)
+            throws TikaException, IOException {
+        if (!this.available) {
+            return text;
+        }
+
+        ObjectNode inputNode = MAPPER.createObjectNode();
+        inputNode.put("lang", targetLanguage);
+        inputNode.put("text", text);
+
+        String responseText = readBody(post(inputNode));
+        return extractGeneratedText(responseText, "Error requesting translation from '"
+                + sourceLanguage + "' to '" + targetLanguage + "'");
+    }
+
+    /**
+     * Translates the text to the target language after detecting its source language.
+     *
+     * @param text           The text to be translated.
+     * @param targetLanguage The target language.
+     * @return The translated text, or {@code text} unchanged when the translator is not
+     *         available.
+     */
+    @Override
+    public String translate(String text, String targetLanguage)
+            throws TikaException, IOException {
+        if (!this.available) {
+            return text;
+        }
+
+        String sourceLanguage = detectLanguage(text).getLanguage();
+        return translate(text, sourceLanguage, targetLanguage);
+    }
+
+    @Override
+    public boolean isAvailable() {
+        return this.available;
+    }
+
+    /** Returns true when the key is present and is not the shipped placeholder. */
+    private static boolean isUsableKey(String key) {
+        return key != null && !key.isEmpty() && !DEFAULT_KEY.equals(key);
+    }
+
+    /** Throws a TikaException carrying the given message when the value is null or empty. */
+    private static void requireNonEmpty(String value, String message) throws TikaException {
+        if (value == null || value.isEmpty()) {
+            throw new TikaException(message);
+        }
+    }
+
+    /** Wraps the input node in the request envelope, posts it and rejects non-2xx answers. */
+    private Response post(ObjectNode inputNode) throws TikaException {
+        List<Object> providers = new ArrayList<>();
+        providers.add(new JacksonJsonProvider());
+        // A client per request keeps concurrent calls from sharing mutable client state.
+        WebClient client = WebClient.create(FL_TRANSLATE_URL_BASE, providers);
+
+        ObjectNode request = MAPPER.createObjectNode();
+        request.set("input", inputNode);
+
+        Response response = client.accept(MediaType.APPLICATION_JSON)
+                .type(MediaType.APPLICATION_JSON).header("apikey", apiKey).post(request);
+        int status = response.getStatus();
+        if (status < 200 || status >= 300) {
+            throw new TikaException("FirstLanguage server returned HTTP status " + status);
+        }
+        return response;
+    }
+
+    /** Returns the response entity as a stream, failing clearly when there is no body. */
+    private static InputStream entityStream(Response response) throws TikaException {
+        Object entity = response.getEntity();
+        if (!(entity instanceof InputStream)) {
+            throw new TikaException("FirstLanguage server returned no readable response body");
+        }
+        return (InputStream) entity;
+    }
+
+    /** Reads the whole response body as UTF-8 text, keeping line breaks intact. */
+    private static String readBody(Response response) throws TikaException, IOException {
+        StringBuilder body = new StringBuilder();
+        try (BufferedReader reader =
+                     new BufferedReader(new InputStreamReader(entityStream(response), UTF_8))) {
+            char[] buffer = new char[4096];
+            int read;
+            while ((read = reader.read(buffer)) != -1) {
+                body.append(buffer, 0, read);
+            }
+        }
+        return body.toString();
+    }
+
+    /** Returns the first "generated_text" value found in the JSON response. */
+    private static String extractGeneratedText(String responseText, String errorPrefix)
+            throws TikaException, IOException {
+        try {
+            JsonNode jsonResp = MAPPER.readTree(responseText);
+            // Older Jackson versions return null for empty content.
+            List<String> generated = jsonResp == null ? new ArrayList<String>()
+                    : jsonResp.findValuesAsText("generated_text");
+            if (generated.isEmpty()) {
+                throw new TikaException("Exception while Translating...");
+            }
+            return generated.get(0);
+        } catch (JsonParseException e) {
+            throw new TikaException(errorPrefix + ", JSON response "
+                    + "from FirstLanguage Server is not well formatted: " + responseText, e);
+        }
+    }
+
+}
diff --git a/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator b/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator
index 71cc28df9d..3f293515f6 100644
--- a/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator
+++ b/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator
@@ -20,3 +20,4 @@ org.apache.tika.language.translate.impl.CachedTranslator
 org.apache.tika.language.translate.impl.JoshuaNetworkTranslator
 org.apache.tika.language.translate.impl.RTGTranslator
 org.apache.tika.language.translate.impl.MarianTranslator
+org.apache.tika.language.translate.impl.FirstLanguageTranslator
\ No newline at end of file
diff --git a/tika-translate/src/main/resources/org/apache/tika/language/translate/impl/translator.firstlanguage.properties b/tika-translate/src/main/resources/org/apache/tika/language/translate/impl/translator.firstlanguage.properties
new file mode 100644
index 0000000000..0265a8091e
--- /dev/null
+++ b/tika-translate/src/main/resources/org/apache/tika/language/translate/impl/translator.firstlanguage.properties
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# To use the FirstLanguage translation service, you must set your API key +# as described in FirstLanguageTranslator. If you do not want translation, +# please set the value to "dummy-key". + +translator.api-key=dummy-key diff --git a/tika-translate/src/test/java/org/apache/tika/language/translate/impl/FirstLanguageTranslatorTest.java b/tika-translate/src/test/java/org/apache/tika/language/translate/impl/FirstLanguageTranslatorTest.java new file mode 100644 index 0000000000..a8625e5e3a --- /dev/null +++ b/tika-translate/src/test/java/org/apache/tika/language/translate/impl/FirstLanguageTranslatorTest.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.tika.language.translate.impl;
+
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assume.assumeTrue;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.tika.language.translate.impl.FirstLanguageTranslator;
+
+/**
+ * Exercises {@link FirstLanguageTranslator} against the live FirstLanguage service.
+ * The test is skipped when the translator reports that it is not available.
+ */
+public class FirstLanguageTranslatorTest {
+
+    private FirstLanguageTranslator translator;
+
+    @Before
+    public void setUp() {
+        translator = new FirstLanguageTranslator();
+    }
+
+    /** Translates a short English sentence to Tamil and compares it with the expected text. */
+    @Test
+    public void testSimpleEnglishToTamilTranslation() throws Exception {
+        // Skip (rather than silently pass) when the translator is unavailable.
+        assumeTrue("FirstLanguage translator is not available", translator.isAvailable());
+
+        String source = "Today is a good day";
+        String expected = "இன்று நல்ல நாள்.";
+        String translated = translator.translate(source, "en", "ta");
+        assertTrue("Translate " + source + " to " + expected + " (was " + translated + ")",
+                expected.equalsIgnoreCase(translated));
+    }
+}