From 4fb4948dccc3fe890ae8f87cf5273a44e92bae05 Mon Sep 17 00:00:00 2001
From: aaron-jack-manning
Date: Sun, 21 Jan 2024 14:20:10 +1100
Subject: [PATCH] initial commit

---
Review notes (not part of the commit; text between the "---" line and the
first "diff --git" line is ignored when the patch is applied):

* src/main.rs adds a single `main` that creates the output directory, fetches
  the album page, and for every row selected from `table#songlist` fetches the
  track page and then the audio file named by the `src` of its `audio` element.
* NOTE(review): the ID3 tag is written into the `audio_file` copy, but
  `fs::write` saves `audio_response.as_bytes()`, so the tag does not appear
  to reach the file on disk; confirm and address in a follow-up commit.
* NOTE(review): the audio response status code is not checked, unlike the
  album and track page responses.
* NOTE(review): several scraping steps use `unwrap()` or direct indexing
  (`meta[2]`, `columns[..]`) and will panic if the page layout differs.

 .gitignore  |   1 +
 Cargo.toml  |  14 +++++++
 README.md   |   5 +++
 src/main.rs | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 138 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Cargo.toml
 create mode 100644 README.md
 create mode 100644 src/main.rs

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2f7896d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+target/
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..955d2dc
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "khinsider"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+anyhow = "1.0.75"
+clap = { version = "4.4.10", features = ["derive"] }
+id3 = "1.13.1"
+minreq = { version = "2.11.0", features = ["https"] }
+sanitize-filename = "0.5.0"
+scraper = "0.14.0"
+url = "2.5.0"
+urlencoding = "2.1.3"
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..ef20a45
--- /dev/null
+++ b/README.md
@@ -0,0 +1,5 @@
+# KHInsider Downloader
+
+Downloads full albums from [KHInsider](https://downloads.khinsider.com) by scraping the webpage for the appropriate download URLs.
+
+Given that this uses web scraping techniques, it is prone to breaking if KHInsider change the way the website looks. If you notice any problems please notify me.
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..abc81da
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,118 @@
+use std::fs;
+use std::path;
+
+use anyhow::Context;
+
+#[derive(clap::Parser)]
+struct Args {
+    /// Name of album as it appears in the URL
+    /// https://downloads.khinsider.com/game-soundtracks/album/{album_name}
+    album : String,
+    /// Output directory for downloads. Will use the album name by default.
+    output : Option<String>,
+}
+
+fn main() -> anyhow::Result<()> {
+    let args : Args = clap::Parser::parse();
+
+    let output = path::PathBuf::from(sanitize_filename::sanitize(args.output.unwrap_or(args.album.clone())));
+    if output.exists() {
+        anyhow::bail!(r#"output path "{}" already exists"#, output.display())
+    } else {
+        fs::create_dir(&output)
+            .context("failed to create output directory")?;
+    }
+
+    let album_response = minreq::get(
+        format!("https://downloads.khinsider.com/game-soundtracks/album/{}", args.album)
+    ).send().context("error when requesting album webpage")?;
+
+    if album_response.status_code != 200 {
+        anyhow::bail!("album page responded with non-200 ({}) response code", album_response.status_code)
+    }
+
+    let album_page = album_response.as_str().context("could not read album page response as a string")?;
+
+    let document = scraper::Html::parse_document(album_page);
+    let tracks_selector = scraper::Selector::parse("table#songlist > tbody > tr:not(#songlist_header):not(#songlist_footer)").unwrap();
+
+    let headings_selector = scraper::Selector::parse("table#songlist > tbody > tr#songlist_header > th").unwrap();
+    let headings = document.select(&headings_selector).map(|a| a.inner_html()).collect::<Vec<_>>();
+
+    let heading_selector = scraper::Selector::parse("div#pageContent > h2").unwrap();
+    let album_name = document.select(&heading_selector).next().unwrap().inner_html();
+
+    for element in document.select(&tracks_selector) {
+
+        let mut tag = id3::Tag::new();
+        use id3::TagLike;
+        tag.set_album(album_name.clone());
+
+        let download_link_selector = scraper::Selector::parse("td.playlistDownloadSong > a").unwrap();
+        let download_link = element.select(&download_link_selector).next().unwrap();
+
+        let columns_selector = scraper::Selector::parse("td").unwrap();
+        let columns = element.select(&columns_selector).collect::<Vec<_>>();
+        let track = columns[headings.iter().position(|x| x == "#").unwrap()].inner_html().trim_end_matches(".").parse::<u32>().unwrap();
+        tag.set_track(track);
+        if let Some(cd) = headings.iter().position(|x| x == "CD") {
+            tag.set_disc(columns[cd].inner_html().parse::<u32>().unwrap_or(1));
+        }
+
+        let track_url = format!(
+            "https://downloads.khinsider.com{}",
+            download_link.value().attr("href")
+                .context("track element did not have media url")?,
+        );
+
+        let track_response = minreq::get(track_url)
+            .send()
+            .context("error when requesting track webpage")?;
+
+        if track_response.status_code != 200 {
+            anyhow::bail!("track page responded with non-200 ({}) response code", track_response.status_code)
+        }
+
+        let track_page = track_response.as_str().context("could not read track page response as a string")?;
+
+        let document = scraper::Html::parse_document(track_page);
+        let audio_selector = scraper::Selector::parse("audio").unwrap();
+        let audio = document.select(&audio_selector).next().unwrap();
+
+        let meta_selector = scraper::Selector::parse("p[align='left'] > b").unwrap();
+        let meta = document.select(&meta_selector);
+
+        let meta = meta.collect::<Vec<_>>();
+        let song_name = meta[2].inner_html();
+        tag.set_title(song_name);
+
+        let audio_url = audio.value().attr("src")
+            .context("audio tag did not have the expected source attribute")?;
+
+        let audio_url = url::Url::parse(audio_url)
+            .context("could not parse url for audio file")?;
+
+        let path = audio_url
+            .path_segments()
+            .map(|iter| iter.last())
+            .flatten()
+            .map(|name| urlencoding::decode(name).ok())
+            .flatten()
+            .context("failed to parse file name from audio url")?;
+
+        println!("[info] downloading track: {}", path);
+
+        let audio_response = minreq::get(audio_url.as_str())
+            .send()
+            .context("error when requesting audio file")?;
+
+        let mut audio_file = audio_response.as_bytes().to_vec();
+
+        tag.write_to(&mut audio_file, id3::Version::Id3v24).unwrap();
+
+        fs::write(output.join(sanitize_filename::sanitize(path.as_ref())), audio_response.as_bytes())
+            .context("error writing audio file")?;
+    }
+
+    Ok(())
+}