Skip to content

Commit

Permalink
Improve lyrics lookup by removing remix & remaster info in query. (#266)
Browse files Browse the repository at this point in the history
Co-authored-by: Thang Pham <phamducthang1234@gmail.com>
  • Loading branch information
Icelk and aome510 authored Oct 10, 2023
1 parent d051d15 commit bd1659a
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 0 deletions.
1 change: 1 addition & 0 deletions lyric_finder/rustfmt.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
edition="2021"
62 changes: 62 additions & 0 deletions lyric_finder/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ impl Client {

/// Search songs satisfying a given `query`.
pub async fn search_songs(&self, query: &str) -> anyhow::Result<Vec<search::Result>> {
let query = improve_query(query);

log::debug!("search songs: query={query}");

let body = self
Expand Down Expand Up @@ -136,6 +138,66 @@ impl Default for Client {
}
}

/// Returns `query` lowercased, with `remaster` & `remix` metadata removed.
///
/// Such metadata causes wildly invalid lyrics to be found
/// (try adding `remastered 2011` to a song's name when searching in Genius!).
///
/// Two rewrites are applied, in order, each to the first occurrence of its keyword:
///
/// * `{song} xxxx Remastered {artists}` becomes `{song} {artists}`: the word starting
///   with `remaster` is removed, together with an all-digit word (a year) directly
///   before it and any spaces/dashes that separated them from the song name.
/// * `{song} - xxxx yyy remix {artists}` becomes `{song} {artists}`: everything from the
///   last dash *preceding* `remix` up to the end of the word starting with `remix` is
///   removed. Nothing is removed when no such dash exists or when fewer than
///   `SONG_MIN_LENGTH_WO_REMIX_METADATA` bytes precede it.
///
/// All offsets are derived from `str` searches and trims, so they always fall on char
/// boundaries and the function does not panic on non-ASCII input.
fn improve_query(query: &str) -> String {
    // Guard against doing something wrong: the remix metadata is only dropped when at
    // least this many bytes of the query precede the dash that introduces it.
    const SONG_MIN_LENGTH_WO_REMIX_METADATA: usize = 3;

    /// Characters that merely separate the song name from its metadata.
    fn is_filler(c: char) -> bool {
        c == '-' || c.is_whitespace()
    }

    /// Byte length of `s` once trailing filler (spaces, dashes) is trimmed away.
    fn trimmed_len(s: &str) -> usize {
        s.trim_end_matches(is_filler).len()
    }

    /// Byte index of the end of the word that continues at `idx`.
    /// Used to handle longer variants of the keywords: `remixed`, `remastered`, etc.
    fn end_of_word(s: &str, idx: usize) -> usize {
        s.get(idx..)
            .and_then(|tail| tail.find(|c: char| !c.is_alphanumeric()))
            // No delimiter found: the word runs to the very end of the string.
            .map_or(s.len(), |offset| idx + offset)
    }

    let mut query = query.to_lowercase();

    // Remove "xxxx Remaster" from the query.
    if let Some(remaster_start) = query.find("remaster") {
        let end = end_of_word(&query, remaster_start + "remaster".len());

        let start = {
            let head = &query[..remaster_start];
            // Skip the separator(s) right before the keyword.
            let word_end = trimmed_len(head);
            let last_word = head[..word_end]
                .rsplit(char::is_whitespace)
                .next()
                .unwrap_or("");

            if !last_word.is_empty() && last_word.chars().all(char::is_numeric) {
                // The preceding word is a year: drop it and the filler before it too.
                trimmed_len(&head[..word_end - last_word.len()])
            } else {
                word_end
            }
        };
        query.drain(start..end);
    }

    // Remove "- xxxx yyy remix" from the query.
    if let Some(remix_start) = query.find("remix") {
        let end = end_of_word(&query, remix_start + "remix".len());

        // Only a dash *before* the keyword can introduce the metadata. A dash further
        // to the right (e.g. in an artist name) would yield an inverted drain range.
        let metadata_start = query[..remix_start]
            .rfind('-')
            .filter(|&dash| dash >= SONG_MIN_LENGTH_WO_REMIX_METADATA);

        if let Some(metadata_start) = metadata_start {
            let start = trimmed_len(&query[..metadata_start]);
            query.drain(start..end);
        }
    }

    query
}

mod parse {
use html5ever::tendril::TendrilSink;
use html5ever::*;
Expand Down

0 comments on commit bd1659a

Please sign in to comment.