Skip to content

Commit

Permalink
Added Soundcloud scraper and parser
Browse files Browse the repository at this point in the history
  • Loading branch information
henrique-coder committed Oct 1, 2024
1 parent 59f0daf commit 769bc57
Showing 1 changed file with 96 additions and 6 deletions.
102 changes: 96 additions & 6 deletions streamsnapper/streamsnapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@
from re import sub as re_sub, compile as re_compile
from unicodedata import normalize
from locale import getlocale
from datetime import datetime
from typing import Any, Dict, List, Literal, Optional, Callable, Union, Type

# Third-party imports
from pysmartdl2 import SmartDL
from scrapetube import get_search as scrape_youtube_search, get_playlist as scrape_youtube_playlist, get_channel as scrape_youtube_channel
from requests import head
from yt_dlp import YoutubeDL, utils as yt_dlp_utils
from sclib import SoundcloudAPI, Track as SoundcloudTrack

# Local imports
from .exceptions import *
Expand Down Expand Up @@ -91,7 +93,7 @@ def __init__(self, enable_ytdlp_log: bool = True) -> None:
else:
self.system_language: str = 'en'

self.media_info: Dict[str, Any] = {}
self.general_info: Dict[str, Any] = {}

self.best_video_streams: List[Dict[str, Any]] = []
self.best_video_stream: Dict[str, Any] = {}
Expand Down Expand Up @@ -161,7 +163,7 @@ def analyze_info(self) -> None:
for chapter in get_value(data, 'chapters', convert_to=list, default_to=[])
]

media_info = {
general_info = {
'fullUrl': f'https://www.youtube.com/watch?v={id_}',
'shortUrl': f'https://youtu.be/{id_}',
'embedUrl': f'https://www.youtube.com/embed/{id_}',
Expand Down Expand Up @@ -196,13 +198,13 @@ def analyze_info(self) -> None:
]
}

for thumbnail in media_info['thumbnails']:
for thumbnail in general_info['thumbnails']:
r = head(thumbnail, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'}, allow_redirects=True, timeout=5)

if r.status_code != 200:
media_info['thumbnails'].remove(thumbnail)
general_info['thumbnails'].remove(thumbnail)

self.media_info = dict(sorted(media_info.items()))
self.general_info = dict(sorted(general_info.items()))

def analyze_video_streams(self, preferred_quality: Literal['all', 'best', '144p', '240p', '360p', '480p', '720p', '1080p', '1440p', '2160p', '4320p'] = 'all') -> None:
"""
Expand Down Expand Up @@ -551,7 +553,95 @@ def get_channel_videos(self, channel_id: str = None, channel_url: str = None, ch
class SoundCloud:
    """A class for extracting and formatting data from SoundCloud tracks and playlists, facilitating access to general track information and audio streams."""

    # Client ID used when the caller does not provide one.
    DEFAULT_CLIENT_ID: str = 'gJUfQ83SeoGM0qvM3VetdqVTDyHmSusF'

    def __init__(self, client_id: str = DEFAULT_CLIENT_ID) -> None:
        """
        Initialize the SoundCloud class.
        :param client_id: The SoundCloud API client ID handed to the underlying API client. Defaults to DEFAULT_CLIENT_ID.
        """

        self._extractor: SoundCloud.Extractor = self.Extractor()
        self._soundcloud_api: SoundcloudAPI = SoundcloudAPI(client_id=client_id)
        # Stays None until run() has resolved a URL.
        self._soundcloud_track: Optional[SoundcloudTrack] = None

        self.general_info: Dict[str, Any] = {}
        self.best_audio_stream: Dict[str, Any] = {}
        self.best_audio_download_url: Optional[str] = None

    def run(self, url: str) -> None:
        """
        Run the process of extracting and formatting data from a SoundCloud track or playlist.
        :param url: The SoundCloud track or playlist URL to extract data from.
        :raises ScrapingError: If an error occurs while scraping the SoundCloud track.
        """

        try:
            self._soundcloud_track = self._soundcloud_api.resolve(url)
        except Exception as e:
            raise ScrapingError(f'Error occurred while scraping SoundCloud track: "{url}"') from e

    @staticmethod
    def _to_original_size(image_url: Optional[str]) -> Optional[str]:
        """
        Rewrite an image URL so it points at the '-original' variant instead of the '-large' one.
        :param image_url: The image URL to rewrite. May be None when no image is available.
        :return: The rewritten URL, or None if no URL was given.
        """

        # Tracks without artwork (or users without an avatar) carry no URL to rewrite.
        if not image_url:
            return None

        return image_url.replace('-large', '-original')

    @staticmethod
    def _iso_to_timestamp(value: str) -> int:
        """
        Convert an ISO 8601 date string into a Unix timestamp.
        :param value: The ISO 8601 date string, optionally ending with 'Z'.
        :return: The Unix timestamp in whole seconds.
        """

        # 'Z' is swapped for an explicit offset because older fromisoformat() versions reject it.
        return int(datetime.fromisoformat(value.replace('Z', '+00:00')).timestamp())

    def analyze_info(self) -> None:
        """
        Extract and format relevant information.
        The result is stored in general_info. run() must have been called first.
        """

        track = self._soundcloud_track

        self.general_info = {
            'id': track.id,
            'userId': track.user_id,
            'username': track.user['username'],
            'userAvatar': self._to_original_size(track.user.get('avatar_url')),
            'title': track.title,
            'artist': track.artist,
            'duration': track.duration,
            'fullUrl': track.permalink_url,
            'thumbnail': self._to_original_size(track.artwork_url),
            'commentCount': track.comment_count,
            'likeCount': track.likes_count,
            'downloadCount': track.download_count,
            'playbackCount': track.playback_count,
            'repostCount': track.reposts_count,
            'uploadTimestamp': self._iso_to_timestamp(track.created_at),
            'lastModifiedTimestamp': self._iso_to_timestamp(track.last_modified),
            'isCommentable': track.commentable,
            'description': track.description,
            'genre': track.genre,
            'tags': track.tag_list,
            'license': track.license,
        }

    def generate_audio_stream(self) -> None:
        """
        Extract and format the best audio stream.
        The stream URL is stored in best_audio_download_url. run() must have been called first.
        """

        self.best_audio_download_url = self._soundcloud_track.get_stream_url()

    class Extractor:
        """A class for extracting data from SoundCloud URLs and searching for SoundCloud tracks."""

        def __init__(self) -> None:
            """Initialize the Extractor class with some regular expressions for analyzing SoundCloud URLs."""

            # Slugs stop at '/', '?' or '#' so query strings and fragments never leak into the result.
            # The lookahead only rejects the exact 'sets' path segment, not track slugs that start with 'sets'.
            self._track_id_regex = re_compile(r'(?:soundcloud\.com/|snd\.sc/)([^/?#]+)/(?!sets(?:[/?#]|$))([^/?#]+)')
            self._playlist_id_regex = re_compile(r'(?:soundcloud\.com/|snd\.sc/)([^/?#]+)/sets/([^/?#]+)')

        def extract_track_slug(self, url: str) -> Optional[str]:
            """
            Extract the SoundCloud track slug from a URL.
            :param url: The URL to extract the track slug from.
            :return: The extracted track slug in the form 'user/track'. If no track slug is found, return None.
            """

            found_match = self._track_id_regex.search(url)
            return f'{found_match.group(1)}/{found_match.group(2)}' if found_match else None

        def extract_playlist_slug(self, url: str) -> Optional[str]:
            """
            Extract the SoundCloud playlist slug from a URL.
            :param url: The URL to extract the playlist slug from.
            :return: The extracted playlist slug in the form 'user/sets/playlist'. If no playlist slug is found, return None.
            """

            found_match = self._playlist_id_regex.search(url)
            return f'{found_match.group(1)}/sets/{found_match.group(2)}' if found_match else None

class Downloader:
"""
Expand Down

0 comments on commit 769bc57

Please sign in to comment.