initial commit

This commit is contained in:
aaron-jack-manning 2024-01-21 14:20:10 +11:00 committed by Aaron Manning
commit 4fb4948dcc
4 changed files with 138 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
target/

14
Cargo.toml Normal file
View File

@ -0,0 +1,14 @@
[package]
name = "khinsider"
version = "0.1.0"
edition = "2021"
[dependencies]
anyhow = "1.0.75"
clap = { version = "4.4.10", features = ["derive"] }
id3 = "1.13.1"
minreq = { version = "2.11.0", features = ["https"] }
sanitize-filename = "0.5.0"
scraper = "0.14.0"
url = "2.5.0"
urlencoding = "2.1.3"

5
README.md Normal file
View File

@ -0,0 +1,5 @@
# KHInsider Downloader
Downloads full albums from [KHInsider](https://downloads.khinsider.com) by scraping the webpage for the appropriate download URLs.
Because this tool relies on web scraping, it is prone to breaking whenever KHInsider changes the structure of its web pages. If you notice any problems, please notify me.

118
src/main.rs Normal file
View File

@ -0,0 +1,118 @@
use std::fs;
use std::path;
use anyhow::Context;
// Command-line arguments, parsed by clap's derive API.
//
// NOTE: the `///` comments on the fields below are picked up by clap and shown
// as the per-argument help text, so editing them changes the program's
// `--help` output. Maintainer notes belong in plain `//` comments like this one.
#[derive(clap::Parser)]
struct Args {
    /// Name of album as it appears in the URL
    /// https://downloads.khinsider.com/game-soundtracks/album/{album_name}
    album: String,

    /// Output directory for downloads. Will use the album name by default.
    output: Option<String>,
}
fn main() -> anyhow::Result<()> {
let args : Args = clap::Parser::parse();
let output = path::PathBuf::from(sanitize_filename::sanitize(args.output.unwrap_or(args.album.clone())));
if output.exists() {
anyhow::bail!(r#"output path "{}" already exists"#, output.display())
} else {
fs::create_dir(&output)
.context("failed to create output directory")?;
}
let album_response = minreq::get(
format!("https://downloads.khinsider.com/game-soundtracks/album/{}", args.album)
).send().context("error when requesting album webpage")?;
if album_response.status_code != 200 {
anyhow::bail!("album page responded with non-200 ({}) response code", album_response.status_code)
}
let album_page = album_response.as_str().context("could not read album page response as a string")?;
let document = scraper::Html::parse_document(album_page);
let tracks_selector = scraper::Selector::parse("table#songlist > tbody > tr:not(#songlist_header):not(#songlist_footer)").unwrap();
let headings_selector = scraper::Selector::parse("table#songlist > tbody > tr#songlist_header > th").unwrap();
let headings = document.select(&headings_selector).map(|a| a.inner_html()).collect::<Vec<_>>();
let heading_selector = scraper::Selector::parse("div#pageContent > h2").unwrap();
let album_name = document.select(&heading_selector).next().unwrap().inner_html();
for element in document.select(&tracks_selector) {
let mut tag = id3::Tag::new();
use id3::TagLike;
tag.set_album(album_name.clone());
let download_link_selector = scraper::Selector::parse("td.playlistDownloadSong > a").unwrap();
let download_link = element.select(&download_link_selector).next().unwrap();
let columns_selector = scraper::Selector::parse("td").unwrap();
let columns = element.select(&columns_selector).collect::<Vec<_>>();
let track = columns[headings.iter().position(|x| x == "<b>#</b>").unwrap()].inner_html().trim_end_matches(".").parse::<u32>().unwrap();
tag.set_track(track);
if let Some(cd) = headings.iter().position(|x| x == "<b>CD</b>") {
tag.set_disc(columns[cd].inner_html().parse::<u32>().unwrap_or(1));
}
let track_url = format!(
"https://downloads.khinsider.com{}",
download_link.value().attr("href")
.context("track element did not have media url")?,
);
let track_response = minreq::get(track_url)
.send()
.context("error when requesting track webpage")?;
if track_response.status_code != 200 {
anyhow::bail!("track page responded with non-200 ({}) response code", track_response.status_code)
}
let track_page = track_response.as_str().context("could not read track page response as a string")?;
let document = scraper::Html::parse_document(track_page);
let audio_selector = scraper::Selector::parse("audio").unwrap();
let audio = document.select(&audio_selector).next().unwrap();
let meta_selector = scraper::Selector::parse("p[align='left'] > b").unwrap();
let meta = document.select(&meta_selector);
let meta = meta.collect::<Vec<_>>();
let song_name = meta[2].inner_html();
tag.set_title(song_name);
let audio_url = audio.value().attr("src")
.context("audio tag did not have the expected source attribute")?;
let audio_url = url::Url::parse(audio_url)
.context("could not parse url for audio file")?;
let path = audio_url
.path_segments()
.map(|iter| iter.last())
.flatten()
.map(|name| urlencoding::decode(name).ok())
.flatten()
.context("failed to parse file name from audio url")?;
println!("[info] downloading track: {}", path);
let audio_response = minreq::get(audio_url.as_str())
.send()
.context("error when requesting audio file")?;
let mut audio_file = audio_response.as_bytes().to_vec();
tag.write_to(&mut audio_file, id3::Version::Id3v24).unwrap();
fs::write(output.join(sanitize_filename::sanitize(path.as_ref())), audio_response.as_bytes())
.context("error writing audio file")?;
}
Ok(())
}