Skip to content

Commit

Permalink
resolve missing subtitles
Browse files Browse the repository at this point in the history
  • Loading branch information
codingPF committed Feb 24, 2024
1 parent d231fb6 commit 0751f40
Show file tree
Hide file tree
Showing 9 changed files with 90 additions and 34 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package de.mediathekview.mserver.base.webaccess;

import okhttp3.ConnectionPool;
import okhttp3.Headers;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
Expand All @@ -11,7 +12,12 @@
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;

import com.google.gson.Gson;
import com.google.gson.JsonElement;

import java.io.IOException;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.TimeUnit;

import static jakarta.ws.rs.core.HttpHeaders.CONTENT_LENGTH;
Expand Down Expand Up @@ -41,11 +47,32 @@ public JsoupConnection(final int timeout, final int threadPoolSize) {
* @throws IOException If no connection to the url could be opened.
*/
public String requestBodyAsString(final String url) throws IOException {
// No extra request headers: delegate to the header-aware overload with a null header map.
return requestBodyAsString(url, null);

}
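/**
* Request an url with optional additional request headers and receive the body as String.
*
* @param url The url to request.
* @param headerMap Header names mapped to their values; each entry is added to the request. May be {@code null} if no additional headers are needed.
* @return The request body as String.
* @throws IOException If no connection to the url could be opened.
*/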
public String requestBodyAsString(final String url, final Map<String, String> headerMap) throws IOException {
int retry = 0;
int httpResponseCode;
final String responseString = "";
do {
final Request request = new Request.Builder().url(url).build();
okhttp3.Headers.Builder headerBuilder = new Headers.Builder();
if (headerMap != null) {
for (Entry<String, String> headerValue : headerMap.entrySet()) {
headerBuilder.add(headerValue.getKey(), headerValue.getValue());
}
}
Request request = new Request.Builder()
.url(url)
.headers(headerBuilder.build())
.build();

try (final Response response = client.newCall(request).execute()) {
httpResponseCode = response.code();
if (response.body() == null || httpResponseCode == 404 || httpResponseCode == 410) {
Expand All @@ -62,6 +89,17 @@ public String requestBodyAsString(final String url) throws IOException {
return responseString;
}

/**
 * Request an url and receive the body parsed as a Gson {@link JsonElement}.
 *
 * <p>The body is fetched through {@link #requestBodyAsString(String, Map)} and then handed to
 * Gson for parsing.
 *
 * @param url The url to request.
 * @param headerMap Header names and values passed on unchanged to the string request.
 * @return The request body parsed as {@link JsonElement}.
 * @throws IOException If no connection to the url could be opened.
 */
public JsonElement requestBodyAsJsonElement(final String url, final Map<String, String> headerMap) throws IOException {
  final Gson jsonParser = new Gson();
  final String responseBody = requestBodyAsString(url, headerMap);
  return jsonParser.fromJson(responseBody, JsonElement.class);
}

/**
* Request an url and receive the body as HTML JSOUP Document
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ protected RecursiveTask<Set<Film>> createCrawlerTask() {
printMessage(ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), allVideos.size());
getAndSetMaxCount(allVideos.size());
//
// History (top categories) > children >
// History (top categories) > children > VideoItem > Episode > Episode2Film
final Set<OrfOnVideoInfoDTO> historyVideos = processHistoryUrlToCrawl();
allVideos.addAll(historyVideos);
printMessage(ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), allVideos.size());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,11 @@
import de.mediathekview.mlib.daten.GeoLocations;
import de.mediathekview.mlib.daten.Resolution;
import de.mediathekview.mserver.base.utils.JsonUtils;
import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
import de.mediathekview.mserver.crawler.orfon.OrfOnConstants;
import de.mediathekview.mserver.crawler.orfon.OrfOnVideoInfoDTO;

import java.io.IOException;
import java.lang.reflect.Type;
import java.net.MalformedURLException;
import java.net.URL;
Expand Down Expand Up @@ -47,11 +50,19 @@ public class OrfOnEpisodeDeserializer implements JsonDeserializer<OrfOnVideoInfo
private static final String TAG_VIDEO_QUALITY = "quality_key";
private static final String TAG_VIDEO_URL = "src";
//
// Path from the episode JSON to its embedded subtitle section.
private static final String[] TAG_SUBTITLE_SECTION = {"_embedded", "subtitle"};
// Keys of the individual subtitle formats; they are looked up directly on the element
// handed to parseSubtitleUrls (the embedded section or a separately requested document).
private static final String TAG_SUBTITLE_SMI = "sami_url";
private static final String TAG_SUBTITLE_SRT = "srt_url";
private static final String TAG_SUBTITLE_TTML = "ttml_url";
private static final String TAG_SUBTITLE_VTT = "vtt_url";
private static final String TAG_SUBTITLE_XML = "xml_url";
//
// Crawler whose connection is used to request subtitle information; set once in the constructor.
private final AbstractCrawler crawler;
//

/**
 * Creates a deserializer that uses the given crawler for additional requests.
 *
 * @param crawler the crawler whose connection is used when subtitle urls have to be requested
 */
public OrfOnEpisodeDeserializer(AbstractCrawler crawler) {
  this.crawler = crawler;
}

@Override
public OrfOnVideoInfoDTO deserialize(
Expand All @@ -71,10 +82,9 @@ public OrfOnVideoInfoDTO deserialize(
parseGeoLocations(JsonUtils.getElementValueAsString(jsonElement, TAG_RIGHT)),
parseSubtitleSource(JsonUtils.getElementValueAsString(jsonElement, TAG_SUBTITLE)),
parseUrl(jsonElement),
parseSubtitleUrls(jsonElement)
buildOrResolveSubs(jsonElement)

);
//LOG.debug("{}",jsonElement );

if (aFilm.getVideoUrls().isEmpty()){
LOG.debug("#####videoUrlEmpty#######");
Expand All @@ -91,21 +101,34 @@ public OrfOnVideoInfoDTO deserialize(
LOG.debug("{}",jsonElement );
LOG.debug("############");
}
if (aFilm.getSubtitleSource().isPresent() && aFilm.getSubtitleUrls().isEmpty()) {
LOG.debug("getSubtitleSource but no getSubtitleUrls {}", aFilm.getId().get());
}

// "genre_title": "Wetter",
// "headline": "Wetter Tirol vom 05.01.2024",
// "profile_title": "Wetter Tirol",
// "title": "Wetter Tirol vom 05.01.2024",
// "share_subject": "Wetter Tirol vom 05.01.2024 vom 05.01.2024 um 19:20 Uhr",
// "sub_headline": "Wetter Tirol",

//
return aFilm;
}

/**
 * Determines the subtitle urls of an episode.
 *
 * <p>If the episode JSON contains an embedded subtitle section, the urls are parsed from it.
 * Otherwise, if a subtitle source url is present, that url is requested through the crawler's
 * connection and the urls are parsed from the response.
 *
 * @param jsonElement the episode JSON
 * @return the result of parsing the subtitle urls, or an empty Optional if neither an embedded
 *     section nor a subtitle source exists, the request fails, or the response has no content
 */
private Optional<Set<URL>> buildOrResolveSubs(JsonElement jsonElement) {
  final Optional<String> subtitleSource = JsonUtils.getElementValueAsString(jsonElement, TAG_SUBTITLE);
  final Optional<JsonElement> embeddedSubtitleSection = JsonUtils.getElement(jsonElement, TAG_SUBTITLE_SECTION);

  if (embeddedSubtitleSection.isPresent()) {
    return parseSubtitleUrls(embeddedSubtitleSection.get());
  }
  if (!subtitleSource.isPresent()) {
    return Optional.empty();
  }

  // NOTE(review): "UTF_8" is not a registered charset name ("UTF-8" is) - confirm what the server expects.
  // NOTE(review): OkHttp only decompresses transparently when Accept-Encoding is NOT set by the
  // caller; confirm the server does not answer "*" with a compressed body.
  final Map<String, String> requestHeaders = Map.ofEntries(
      Map.entry("Authorization", OrfOnConstants.AUTH),
      Map.entry("Accept-Charset", "UTF_8"),
      Map.entry("User-Agent", "Mozilla"),
      Map.entry("Accept-Encoding", "*"));

  final String subtitleUrl = subtitleSource.get();
  try {
    final JsonElement subtitleResponse =
        crawler.getConnection().requestBodyAsJsonElement(subtitleUrl, requestHeaders);
    // Gson yields null for an empty body; treat that as "no subtitles" instead of parsing null.
    if (subtitleResponse != null) {
      return parseSubtitleUrls(subtitleResponse);
    }
  } catch (IOException e) {
    // The exception is passed as trailing argument without a placeholder so the logger records
    // the stack trace instead of only its toString().
    LOG.error("Failed to resolve subtitle from {}", subtitleUrl, e);
  }
  return Optional.empty();
}

private Optional<URL> parseSubtitleSource(Optional<String> text) {
Optional<URL> sub = Optional.empty();
if (text.isPresent()) {
Expand All @@ -119,7 +142,6 @@ private Optional<URL> parseSubtitleSource(Optional<String> text) {

}


private Optional<Set<URL>> parseSubtitleUrls(JsonElement element) {
Set<URL> urls = new HashSet<>();
JsonUtils.getElementValueAsString(element, TAG_SUBTITLE_SMI).ifPresent( stringUrl -> toURL(stringUrl).ifPresent(urls::add));
Expand Down Expand Up @@ -150,8 +172,7 @@ private Optional<Map<Resolution, FilmUrl>> parseUrl(JsonElement jsonElement) {
LOG.debug("unkown video type {} ", jsonElement);
}
}



Optional<Map<Resolution, FilmUrl>> urls = Optional.empty();
Optional<String> codec = Optional.empty(); //
if (jsonElement.getAsJsonObject().has(TAG_VIDEO) &&
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import com.google.gson.*;

import de.mediathekview.mserver.base.utils.JsonUtils;
import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
import de.mediathekview.mserver.crawler.basic.PagedElementListDTO;
import de.mediathekview.mserver.crawler.orfon.OrfOnVideoInfoDTO;

Expand All @@ -12,7 +13,11 @@
public class OrfOnEpisodesDeserializer implements JsonDeserializer<PagedElementListDTO<OrfOnVideoInfoDTO>> {
private static final String[] TAG_NEXT_PAGE = {"_links", "next", "href"};
private static final String[] TAG_ITEMS = {"_embedded", "items"};
// Episode deserializer created with the crawler given to the constructor; set once.
private final OrfOnEpisodeDeserializer itemDeserializer;

/**
 * Creates the deserializer together with the episode deserializer it holds.
 *
 * @param crawler the crawler handed on to the {@link OrfOnEpisodeDeserializer}
 */
public OrfOnEpisodesDeserializer(AbstractCrawler crawler) {
  this.itemDeserializer = new OrfOnEpisodeDeserializer(crawler);
}

@Override
public PagedElementListDTO<OrfOnVideoInfoDTO> deserialize(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,6 @@ public PagedElementListDTO<OrfOnBreadCrumsUrlDTO> deserialize(
} else {
LOG.info("No video_items or children tag found {}",JsonUtils.getElementValueAsString(item, TAG_ITEM_TITLE) );
}
/*
LOG.debug("OrfOnHistoryChildrenDeserializer {} - {} - {}",
JsonUtils.getElementValueAsString(item, TAG_ITEM_TITLE),
JsonUtils.getElementValueAsString(item, TAG_TARGET_URL),
JsonUtils.getElementValueAsString(item, TAG_TARGET_URL2));*/

}
}
//
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ public PagedElementListDTO<OrfOnBreadCrumsUrlDTO> parseSection(JsonArray itemArr
} else {
LOG.debug("missing url for {}", title);
}
//LOG.debug("History Item {} {}", title, url);
}
return items;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ public PagedElementListDTO<OrfOnBreadCrumsUrlDTO> deserialize(
final Optional<String> id = JsonUtils.getElementValueAsString(element, TAG_FILM_ID);
if (id.isPresent()) {
final String url = OrfOnConstants.EPISODE + "/" + id.get();
//LOG.debug("found {} {} {}", id, name, url);
collectIds.addElement(new OrfOnBreadCrumsUrlDTO(id.get(), url));
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ public OrfOnEpisodeTask(AbstractCrawler crawler, Queue<OrfOnBreadCrumsUrlDTO> ur

/**
 * Provides the deserializer for a single episode.
 *
 * @param aDTO the url entry being processed (not used to select the parser)
 * @return a new episode deserializer bound to this task's crawler
 */
@Override
protected JsonDeserializer<OrfOnVideoInfoDTO> getParser(OrfOnBreadCrumsUrlDTO aDTO) {
  // The deserializer takes the crawler as constructor argument; the no-arg form no longer exists.
  return new OrfOnEpisodeDeserializer(this.crawler);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ public OrfOnEpisodesTask(AbstractCrawler crawler, Queue<OrfOnBreadCrumsUrlDTO> u

/**
 * Provides the deserializer for a paged list of episodes.
 *
 * @param aDTO the url entry being processed (not used to select the parser)
 * @return a new episodes deserializer bound to this task's crawler
 */
@Override
protected JsonDeserializer<PagedElementListDTO<OrfOnVideoInfoDTO>> getParser(OrfOnBreadCrumsUrlDTO aDTO) {
  // The deserializer takes the crawler as constructor argument; the no-arg form no longer exists.
  return new OrfOnEpisodesDeserializer(this.crawler);
}

@Override
Expand Down

0 comments on commit 0751f40

Please sign in to comment.