Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[YouTube] Fix hashtags links extraction and escape HTML links #1032

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -822,7 +822,7 @@ public static String[] getYoutubeMusicKey()

try {
final String url = "https://music.youtube.com/sw.js";
final var headers = getOriginReferrerHeaders("https://music.youtube.com");
final var headers = getOriginReferrerHeaders(YOUTUBE_MUSIC_URL);
final String response = getDownloader().get(url, headers).responseBody();
musicClientVersion = getStringResultFromRegexArray(response,
INNERTUBE_CONTEXT_CLIENT_VERSION_REGEXES, 1);
Expand All @@ -843,18 +843,11 @@ public static String[] getYoutubeMusicKey()
}

@Nullable
public static String getUrlFromNavigationEndpoint(@Nonnull final JsonObject navigationEndpoint)
throws ParsingException {
if (navigationEndpoint.has("webCommandMetadata")) {
// this case needs to be handled before the browseEndpoint,
// e.g. for hashtags in comments
final JsonObject metadata = navigationEndpoint.getObject("webCommandMetadata");
if (metadata.has("url")) {
return "https://www.youtube.com" + metadata.getString("url");
}
}
public static String getUrlFromNavigationEndpoint(
@Nonnull final JsonObject navigationEndpoint) {
if (navigationEndpoint.has("urlEndpoint")) {
String internUrl = navigationEndpoint.getObject("urlEndpoint").getString("url");
String internUrl = navigationEndpoint.getObject("urlEndpoint")
.getString("url");
if (internUrl.startsWith("https://www.youtube.com/redirect?")) {
// remove https://www.youtube.com part to fall in the next if block
internUrl = internUrl.substring(23);
Expand All @@ -879,7 +872,9 @@ public static String getUrlFromNavigationEndpoint(@Nonnull final JsonObject navi
|| internUrl.startsWith("/watch")) {
return "https://www.youtube.com" + internUrl;
}
} else if (navigationEndpoint.has("browseEndpoint")) {
}

if (navigationEndpoint.has("browseEndpoint")) {
final JsonObject browseEndpoint = navigationEndpoint.getObject("browseEndpoint");
final String canonicalBaseUrl = browseEndpoint.getString("canonicalBaseUrl");
final String browseId = browseEndpoint.getString("browseId");
Expand All @@ -892,26 +887,39 @@ public static String getUrlFromNavigationEndpoint(@Nonnull final JsonObject navi
if (!isNullOrEmpty(canonicalBaseUrl)) {
return "https://www.youtube.com" + canonicalBaseUrl;
}
}

throw new ParsingException("canonicalBaseUrl is null and browseId is not a channel (\""
+ browseEndpoint + "\")");
} else if (navigationEndpoint.has("watchEndpoint")) {
if (navigationEndpoint.has("watchEndpoint")) {
final StringBuilder url = new StringBuilder();
url.append("https://www.youtube.com/watch?v=").append(navigationEndpoint
.getObject("watchEndpoint").getString(VIDEO_ID));
url.append("https://www.youtube.com/watch?v=")
.append(navigationEndpoint.getObject("watchEndpoint")
.getString(VIDEO_ID));
if (navigationEndpoint.getObject("watchEndpoint").has("playlistId")) {
url.append("&list=").append(navigationEndpoint.getObject("watchEndpoint")
.getString("playlistId"));
}
if (navigationEndpoint.getObject("watchEndpoint").has("startTimeSeconds")) {
url.append("&t=").append(navigationEndpoint.getObject("watchEndpoint")
url.append("&t=")
.append(navigationEndpoint.getObject("watchEndpoint")
.getInt("startTimeSeconds"));
}
return url.toString();
} else if (navigationEndpoint.has("watchPlaylistEndpoint")) {
}

if (navigationEndpoint.has("watchPlaylistEndpoint")) {
return "https://www.youtube.com/playlist?list="
+ navigationEndpoint.getObject("watchPlaylistEndpoint").getString("playlistId");
+ navigationEndpoint.getObject("watchPlaylistEndpoint")
.getString("playlistId");
}

if (navigationEndpoint.has("commandMetadata")) {
final JsonObject metadata = navigationEndpoint.getObject("commandMetadata")
.getObject("webCommandMetadata");
if (metadata.has("url")) {
return "https://www.youtube.com" + metadata.getString("url");
}
}

return null;
}

Expand All @@ -924,8 +932,7 @@ public static String getUrlFromNavigationEndpoint(@Nonnull final JsonObject navi
* @return text in the JSON object or {@code null}
*/
@Nullable
public static String getTextFromObject(final JsonObject textObject, final boolean html)
throws ParsingException {
public static String getTextFromObject(final JsonObject textObject, final boolean html) {
if (isNullOrEmpty(textObject)) {
return null;
}
Expand All @@ -944,12 +951,12 @@ public static String getTextFromObject(final JsonObject textObject, final boolea
String text = run.getString("text");

if (html) {
text = Entities.escape(text);
if (run.has("navigationEndpoint")) {
final String url = getUrlFromNavigationEndpoint(run
.getObject("navigationEndpoint"));
final String url = getUrlFromNavigationEndpoint(
run.getObject("navigationEndpoint"));
if (!isNullOrEmpty(url)) {
text = "<a href=\"" + url + "\">" + text + "</a>";
text = "<a href=\"" + Entities.escape(url) + "\">" + Entities.escape(text)
+ "</a>";
}
}

Expand Down Expand Up @@ -1015,11 +1022,12 @@ public static String getAttributedDescription(
}

final String content = attributedDescription.getString("content");
final JsonArray commandRuns = attributedDescription.getArray("commandRuns");
if (content == null) {
return null;
}

final JsonArray commandRuns = attributedDescription.getArray("commandRuns");

final StringBuilder textBuilder = new StringBuilder();
int textStart = 0;

Expand All @@ -1038,12 +1046,7 @@ public static String getAttributedDescription(
continue;
}

final String url;
try {
url = getUrlFromNavigationEndpoint(navigationEndpoint);
} catch (final ParsingException e) {
continue;
}
final String url = getUrlFromNavigationEndpoint(navigationEndpoint);

if (url == null) {
continue;
Expand All @@ -1062,9 +1065,9 @@ public static String getAttributedDescription(
.replaceFirst("^[/•] *", "");

textBuilder.append("<a href=\"")
.append(url)
.append(Entities.escape(url))
.append("\">")
.append(linkText)
.append(Entities.escape(linkText))
.append("</a>");

textStart = startIndex + length;
Expand All @@ -1081,13 +1084,12 @@ public static String getAttributedDescription(
}

@Nullable
public static String getTextFromObject(final JsonObject textObject) throws ParsingException {
public static String getTextFromObject(final JsonObject textObject) {
return getTextFromObject(textObject, false);
}

@Nullable
public static String getUrlFromObject(final JsonObject textObject) throws ParsingException {

public static String getUrlFromObject(final JsonObject textObject) {
if (isNullOrEmpty(textObject)) {
return null;
}
Expand All @@ -1108,8 +1110,7 @@ public static String getUrlFromObject(final JsonObject textObject) throws Parsin
}

@Nullable
public static String getTextAtKey(@Nonnull final JsonObject jsonObject, final String theKey)
throws ParsingException {
public static String getTextAtKey(@Nonnull final JsonObject jsonObject, final String theKey) {
if (jsonObject.isString(theKey)) {
return jsonObject.getString(theKey);
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,10 @@ public YoutubeChannelInfoItemExtractor(final JsonObject channelInfoItem) {
this.channelInfoItem = channelInfoItem;

boolean wHandle = false;
try {
final String subscriberCountText = getTextFromObject(
channelInfoItem.getObject("subscriberCountText"));
if (subscriberCountText != null) {
wHandle = subscriberCountText.startsWith("@");
}
} catch (final ParsingException ignored) {
final String subscriberCountText = getTextFromObject(
channelInfoItem.getObject("subscriberCountText"));
if (subscriberCountText != null) {
wHandle = subscriberCountText.startsWith("@");
}
this.withHandle = wHandle;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,11 +168,7 @@ public String getName() throws ParsingException {
title = playerResponse.getObject("videoDetails").getString("title");

if (isNullOrEmpty(title)) {
try {
title = getTextFromObject(getVideoPrimaryInfoRenderer().getObject("title"));
} catch (final ParsingException ignored) {
// Age-restricted videos cause a ParsingException here
}
title = getTextFromObject(getVideoPrimaryInfoRenderer().getObject("title"));

if (isNullOrEmpty(title)) {
throw new ParsingException("Could not get name");
Expand Down Expand Up @@ -285,21 +281,17 @@ public String getThumbnailUrl() throws ParsingException {
public Description getDescription() throws ParsingException {
assertPageFetched();
// Description with more info on links
try {
final String description = getTextFromObject(
getVideoSecondaryInfoRenderer().getObject("description"),
true);
if (!isNullOrEmpty(description)) {
return new Description(description, Description.HTML);
}
final String videoSecondaryInfoRendererDescription = getTextFromObject(
getVideoSecondaryInfoRenderer().getObject("description"),
true);
if (!isNullOrEmpty(videoSecondaryInfoRendererDescription)) {
return new Description(videoSecondaryInfoRendererDescription, Description.HTML);
}

final String attributedDescription = getAttributedDescription(
getVideoSecondaryInfoRenderer().getObject("attributedDescription"));
if (!isNullOrEmpty(attributedDescription)) {
return new Description(attributedDescription, Description.HTML);
}
} catch (final ParsingException ignored) {
// Age-restricted videos cause a ParsingException here
final String attributedDescription = getAttributedDescription(
getVideoSecondaryInfoRenderer().getObject("attributedDescription"));
if (!isNullOrEmpty(attributedDescription)) {
return new Description(attributedDescription, Description.HTML);
}

String description = playerResponse.getObject("videoDetails")
Expand Down Expand Up @@ -400,14 +392,8 @@ public long getTimeStamp() throws ParsingException {

@Override
public long getViewCount() throws ParsingException {
String views = null;

try {
views = getTextFromObject(getVideoPrimaryInfoRenderer().getObject("viewCount")
.getObject("videoViewCountRenderer").getObject("viewCount"));
} catch (final ParsingException ignored) {
// Age-restricted videos cause a ParsingException here
}
String views = getTextFromObject(getVideoPrimaryInfoRenderer().getObject("viewCount")
.getObject("videoViewCountRenderer").getObject("viewCount"));

if (isNullOrEmpty(views)) {
views = playerResponse.getObject("videoDetails").getString("viewCount");
Expand Down Expand Up @@ -795,7 +781,7 @@ public String getErrorMessage() {
return getTextFromObject(playerResponse.getObject("playabilityStatus")
.getObject("errorScreen").getObject("playerErrorMessageRenderer")
.getObject("reason"));
} catch (final ParsingException | NullPointerException e) {
} catch (final NullPointerException e) {
return null; // No error message
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -183,10 +183,10 @@ public static void setUp() throws Exception {
@Override public String expectedUploaderUrl() { return "https://www.youtube.com/channel/UCsTcErHg8oDvUnTzoqsYeNw"; }
@Override public long expectedUploaderSubscriberCountAtLeast() { return 18_000_000; }
@Override public List<String> expectedDescriptionContains() {
return Arrays.asList("https://www.youtube.com/watch?v=X7FLCHVXpsA&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
"https://www.youtube.com/watch?v=Lqv6G0pDNnw&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
"https://www.youtube.com/watch?v=XxaRBPyrnBU&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
"https://www.youtube.com/watch?v=U-9tUEOFKNU&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34");
return Arrays.asList("https://www.youtube.com/watch?v=X7FLCHVXpsA&amp;list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
"https://www.youtube.com/watch?v=Lqv6G0pDNnw&amp;list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
"https://www.youtube.com/watch?v=XxaRBPyrnBU&amp;list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
"https://www.youtube.com/watch?v=U-9tUEOFKNU&amp;list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34");
}
@Override public long expectedLength() { return 434; }
@Override public long expectedViewCountAtLeast() { return 21229200; }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
"httpMethod": "GET",
"url": "https://www.youtube.com/sw.js",
"headers": {
"Origin": [
"Referer": [
"https://www.youtube.com"
],
"Referer": [
"Origin": [
"https://www.youtube.com"
],
"Accept-Language": [
Expand All @@ -29,7 +29,7 @@
"https://www.youtube.com"
],
"alt-svc": [
"h3\u003d\":443\"; ma\u003d2592000,h3-29\u003d\":443\"; ma\u003d2592000,h3-Q050\u003d\":443\"; ma\u003d2592000,h3-Q046\u003d\":443\"; ma\u003d2592000,h3-Q043\u003d\":443\"; ma\u003d2592000,quic\u003d\":443\"; ma\u003d2592000; v\u003d\"46,43\""
"h3\u003d\":443\"; ma\u003d2592000,h3-29\u003d\":443\"; ma\u003d2592000"
],
"cache-control": [
"private, max-age\u003d0"
Expand All @@ -41,10 +41,10 @@
"same-origin; report-to\u003d\"youtube_main\""
],
"date": [
"Mon, 28 Nov 2022 20:27:36 GMT"
"Sun, 26 Feb 2023 17:48:54 GMT"
],
"expires": [
"Mon, 28 Nov 2022 20:27:36 GMT"
"Sun, 26 Feb 2023 17:48:54 GMT"
],
"p3p": [
"CP\u003d\"This is not a P3P policy! See http://support.google.com/accounts/answer/151657?hl\u003den-GB for more info.\""
Expand All @@ -59,9 +59,9 @@
"ESF"
],
"set-cookie": [
"YSC\u003ddaTQ98V-voQ; Domain\u003d.youtube.com; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone",
"VISITOR_INFO1_LIVE\u003d; Domain\u003d.youtube.com; Expires\u003dTue, 03-Mar-2020 20:27:36 GMT; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone",
"CONSENT\u003dPENDING+452; expires\u003dWed, 27-Nov-2024 20:27:36 GMT; path\u003d/; domain\u003d.youtube.com; Secure"
"YSC\u003dYJXWRWCYVkE; Domain\u003d.youtube.com; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone",
"VISITOR_INFO1_LIVE\u003d; Domain\u003d.youtube.com; Expires\u003dMon, 01-Jun-2020 17:48:54 GMT; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone",
"CONSENT\u003dPENDING+668; expires\u003dTue, 25-Feb-2025 17:48:54 GMT; path\u003d/; domain\u003d.youtube.com; Secure"
],
"strict-transport-security": [
"max-age\u003d31536000"
Expand Down

Large diffs are not rendered by default.

Loading