initial commit

aaron-jack-manning 2024-01-21 14:20:10 +11:00
commit e980697f57
5 changed files with 1193 additions and 0 deletions

.gitignore vendored Normal file, 1 line added

@@ -0,0 +1 @@
target/

Cargo.lock generated Normal file, 1086 lines added (file diff suppressed because it is too large)

Cargo.toml Normal file, 14 lines added

@@ -0,0 +1,14 @@
[package]
name = "khinsider"
version = "0.1.0"
edition = "2021"

[dependencies]
anyhow = "1.0.75"
clap = { version = "4.4.10", features = ["derive"] }
minreq = { version = "2.11.0", features = ["https"] }
sanitize-filename = "0.5.0"
scraper = "0.14.0"
url = "2.5.0"
urlencoding = "2.1.3"

README.md Normal file, 8 lines added

@@ -0,0 +1,8 @@
# KHInsider Downloader

Downloads full albums from [KHInsider](https://downloads.khinsider.com) by scraping the webpage for the appropriate download URLs.

Given that this relies on web scraping, it is prone to breaking if KHInsider changes the way the website looks. If you notice any problems, please notify me.

## To Do

- [ ] Tag MP3s before writing to disk so that music players get the album name, art, and track numbering right (a sketch of one possible approach follows below)
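
As a rough sketch of the tagging item above: once a track has been written to disk, basic ID3 metadata could be attached with a tag library. This assumes the `id3` crate, which is not a dependency of this commit, and the `tag_mp3` helper and its arguments are hypothetical.

```rust
// Sketch only: the `id3` crate is assumed here and is not in this commit's Cargo.toml.
use std::path::Path;

use id3::{Tag, TagLike, Version};

/// Hypothetical helper: attach album/title/track metadata to an already-written MP3.
fn tag_mp3(path: &Path, album: &str, title: &str, track: u32) -> anyhow::Result<()> {
    let mut tag = Tag::new();
    tag.set_album(album);
    tag.set_title(title);
    tag.set_track(track);
    // Cover art could additionally be embedded as an APIC frame (id3::frame::Picture),
    // using an image fetched from the album page.
    tag.write_to_path(path, Version::Id3v24)?;
    Ok(())
}
```

Such a helper could be called right after the `fs::write` in the track loop of `src/main.rs` below, with the track number taken from the loop index.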

src/main.rs Normal file, 84 lines added

@@ -0,0 +1,84 @@
use std::fs;
use std::path;

use anyhow::Context;

#[derive(clap::Parser)]
struct Args {
    /// Name of album as it appears in the URL
    /// https://downloads.khinsider.com/game-soundtracks/album/{album_name}
    album : String,
    /// Output directory for downloads. Will use the album name by default.
    output : Option<String>,
}

fn main() -> anyhow::Result<()> {
    let args : Args = clap::Parser::parse();

    let output = path::PathBuf::from(sanitize_filename::sanitize(args.output.unwrap_or(args.album.clone())));

    if output.exists() {
        anyhow::bail!(r#"output path "{}" already exists"#, output.display())
    } else {
        fs::create_dir(&output)
            .context("failed to create output directory")?;
    }

    // Fetch the album page, which lists one row per track in table#songlist.
    let album_response = minreq::get(
        format!("https://downloads.khinsider.com/game-soundtracks/album/{}", args.album)
    ).send().context("error when requesting album webpage")?;

    if album_response.status_code != 200 {
        anyhow::bail!("album page responded with non-200 ({}) response code", album_response.status_code)
    }

    let album_page = album_response.as_str().context("could not read album page response as a string")?;

    let document = scraper::Html::parse_document(album_page);

    // Each track row links to its own page from the playlistDownloadSong cell.
    let selector = scraper::Selector::parse("table#songlist > tbody > tr > td.playlistDownloadSong > a").unwrap();

    for element in document.select(&selector) {
        let track_url = format!(
            "https://downloads.khinsider.com{}",
            element.value().attr("href")
                .context("track element did not have media url")?,
        );

        let track_response = minreq::get(track_url)
            .send()
            .context("error when requesting track webpage")?;

        if track_response.status_code != 200 {
            anyhow::bail!("track page responded with non-200 ({}) response code", track_response.status_code)
        }

        let track_page = track_response.as_str().context("could not read track page response as a string")?;

        let document = scraper::Html::parse_document(track_page);

        // The track page embeds the direct media URL in an <audio> element's src attribute.
        let selector = scraper::Selector::parse("audio").unwrap();

        for element in document.select(&selector) {
            let audio_url = element.value().attr("src")
                .context("audio tag did not have the expected source attribute")?;

            let audio_url = url::Url::parse(audio_url)
                .context("could not parse url for audio file")?;

            // Use the final, percent-decoded path segment of the media URL as the local file name.
            let path = audio_url
                .path_segments()
                .map(|iter| iter.last())
                .flatten()
                .map(|name| urlencoding::decode(name).ok())
                .flatten()
                .context("failed to parse file name from audio url")?;

            println!("[info] downloading track: {}", path);

            let audio_response = minreq::get(audio_url.as_str())
                .send()
                .context("error when requesting audio file")?;

            fs::write(output.join(sanitize_filename::sanitize(path.as_ref())), audio_response.as_bytes())
                .context("error writing audio file")?;
        }
    }

    Ok(())
}
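
For reference, the clap derive above yields a CLI with one required positional argument (the album slug from the URL) and an optional output directory, so an invocation would look roughly like the following; the slug here is a placeholder following the URL pattern in the doc comment, not a real album.

```sh
# The album slug is a placeholder; the output directory argument is optional.
cargo run --release -- some-album-slug my-album
```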